Imported Upstream version 1.10.0upstream/1.10.0 submit/tizen/20201028.104702 submit/tizen/20201028.031836 accepted/tizen/unified/20201029.124827

author: Chunseok Lee <chunseok.lee@samsung.com> 2020-10-28 12:16:55 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2020-10-28 12:16:55 +0900
commit: c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (patch)
tree: 761ee8e171e5203f5c598ad93b2e7e0bc2e31aa2
parent: 74476a2d0296bdad70a2f7f90bc7419a8b05bffd (diff)
download: nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.gz
nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.bz2
nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.zip
922 files changed, 37320 insertions, 9277 deletions
diff --git a/Makefile.template b/Makefile.template
index 1b2f564c0..8e88e9092 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -129,12 +129,14 @@ configure_internal:
 ifneq ($(EXT_ACL_FOLDER),)
 	mkdir -p $(OVERLAY_FOLDER)/lib
 	cp $(EXT_ACL_FOLDER)/* $(OVERLAY_FOLDER)/lib
+# Make stamp file
+	printf "20.05" > $(OVERLAY_FOLDER)/ARMCOMPUTE.stamp
 endif
 
 	NNFW_WORKSPACE="$(WORKSPACE)" NNFW_INSTALL_PREFIX=$(INSTALL_PATH) ./nnfw configure \
 		-DCMAKE_BUILD_TYPE=$(BUILD_TYPE_LC) \
 		-DNNFW_OVERLAY_DIR=$(OVERLAY_FOLDER) \
-		-DACL_BUILD_THREADS=$(NPROCS) \
+		-DEXTERNALS_BUILD_THREADS=$(NPROCS) \
 		$(OPTIONS)
 	touch $(TIMESTAMP_CONFIGURE)
 
@@ -151,17 +153,17 @@ install_internal:
 	touch $(TIMESTAMP_INSTALL)
 
 runtime_tar_internal: $(TIMESTAMP_BUILD) install_internal
-	tar -zcf $(WORKSPACE)/nnfw-package.tar.gz -C $(INSTALL_PATH) lib
-	tar -zcf $(WORKSPACE)/nnfw-devel-package.tar.gz -C $(INSTALL_PATH) include/nnfw
-	tar -zcf $(WORKSPACE)/nnfw-plugin-devel-package.tar.gz -C $(INSTALL_PATH) include/onert
-	tar -zcf $(WORKSPACE)/nnfw-test-package.tar.gz -C $(INSTALL_PATH) $(shell ls $(INSTALL_PATH) -I lib -I include)
+	tar -zcf $(WORKSPACE)/onert-package.tar.gz -C $(INSTALL_PATH) lib
+	tar -zcf $(WORKSPACE)/onert-devel-package.tar.gz -C $(INSTALL_PATH) include/nnfw
+	tar -zcf $(WORKSPACE)/onert-plugin-devel-package.tar.gz -C $(INSTALL_PATH) include/onert
+	tar -zcf $(WORKSPACE)/onert-test-package.tar.gz -C $(INSTALL_PATH) $(shell ls $(INSTALL_PATH) -I lib -I include)
 
 acl_tar_internal: $(BUILD_FOLDER)
-	tar -zcf $(WORKSPACE)/nnfw-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_core.so lib/libarm_compute_graph.so
+	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_core.so lib/libarm_compute_graph.so
 
 install_internal_acl:
 # Workaround to install acl for test (ignore error when there is no file to copy)
-	cp $(OVERLAY_FOLDER)/lib/libarm_compute* $(INSTALL_ALIAS)/lib || true
+	cp $(OVERLAY_FOLDER)/lib/libarm_compute*.so $(INSTALL_ALIAS)/lib || true
 
 build_test_suite: install_internal install_internal_acl
 	@echo "packaging test suite"
diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt
index fcf01de7d..59b61a538 100644
--- a/compiler/bcq-tools/CMakeLists.txt
+++ b/compiler/bcq-tools/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(BCQ_TOOLS_FILES
+    generate_bcq_metadata
     generate_bcq_output_arrays
-    preserve_bcq_info
 )
 
 foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES})
diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md
index 18b0f4826..0acd0ba00 100644
--- a/compiler/bcq-tools/README.md
+++ b/compiler/bcq-tools/README.md
@@ -2,77 +2,69 @@
 
 This directory includes some tools related with BCQ.
 
-## preserve_bcq_info
+## generate_bcq_output_arrays
 
 ### Purpose
 
-`preserve_bcq_info` is for preserving constant nodes which include BCQ information.
-When `.pb` file is converted to `.tflite` file by TFlite converter, constant nodes whose values are exactly same are removed and then linked to only one representative node.
-This makes us impossible to know what constant node should be linked to a node which we want to apply BCQ.
-One of the solutions is making all the same constant nodes different by inserting unique values and ignore the newly generated unique values when BCQ fusing is applied.
-`preserve_bcq_info` will generate and insert unique dummy values to the constant nodes whose values are same not to be removed by Tensorflow Lite converter.
-As a result, BCQ information will be preserved.
+To apply BCQ, BCQ information nodes should be designated as model output so that they are alive even after TFLite conversion is finished.
+However, there are so many nodes to designate and sometimes we cannot copy and paste all of them because the string size is too big.
+`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes.
 
 ### How to use
 
 ```bash
-preserve_bcq_info \
+generate_bcq_output_arrays \
 --input_path /path/to/original_model.pb \
---output_path /path/to/preserved_model.pb
+--output_path /path/to/output_arrays.txt
 ```
 
 ### How it works
 
-If we add unique dummy value at the end of each constant nodes, all the constant nodes would be different. Following is an example.
-
 ```
-[Original Constant Nodes]
-const(value=[1, 2, 3], name='const1')
-const(value=[1, 2, 3], name='const2')
-const(value=[1, 2, 3], name='const3')
-
-[After BCQ information preserved]
+[Original BCQ information nodes]
 const(value=[1, 2, 3, -1], name='const1')
 const(value=[1, 2, 3, -2], name='const2')
 const(value=[1, 2, 3, -3], name='const3')
-```
 
-For dummy values, negative values are used instead of positive values.
-This is because positive valus may be confused with original constant node values.
-For your information, unique dummy value starts from -1 and moves to -2, -3, ..., -N, where N is the number of preserved constant nodes.
+[Generated output_arrays]
+,const1,const2,const3
+```
 
 ### Caution
 
-- Newly generated dummy values should be ignored when the constant nodes are used.
+- Generated output_arrays will be start with comma.
 
-## generate_bcq_output_arrays
+## generate_bcq_metadata
 
 ### Purpose
 
-To apply BCQ, BCQ information nodes should be designated as model output so that they are alive even after TFLite conversion is finished.
-However, there are so many nodes to designate and sometimes we cannot copy and paste all of them because the string size is too big.
-`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes.
+`generate_bcq_metadata` is for appending metadata as output of a model which includes BCQ information.
+The appended metadata is used for connecting BCQ related operations and constant nodes.
 
 ### How to use
 
 ```bash
-generate_bcq_output_arrays \
+generate_bcq_metadata \
 --input_path /path/to/original_model.pb \
---output_path /path/to/output_arrays.txt
+--output_path /path/to/metadata_inserted_model.pb
+--output_arrays output1,output2,...,outputN
 ```
 
 ### How it works
 
+Metadata will be generated as following description.
 ```
-[Original BCQ information nodes]
-const(value=[1, 2, 3, -1], name='const1')
-const(value=[1, 2, 3, -2], name='const2')
-const(value=[1, 2, 3, -3], name='const3')
-
-[Generated output_arrays]
-,const1,const2,const3
+< Generated Metadata in BCQ version 1 >
+[0] Starting magic number                = {-2e9 + 27}
+[1] Version of BCQ                       = {1}
+[2] The number of original model outputs = {N | N > 0}
+[3] Bundle size                          = {7, 8}
+[4] Ending magic number                  = {2e9 - 27}
 ```
+- BCQ version 1
+    - Two magic numbers, starting and ending magic number, are used for indicating that the model includes BCQ metadata. To decrease value duplication probability, prime number is used and the value is inserted not only at the beginning but also at the end.
+    - The word **bundle** means that a set of BCQ information and BCQ applicable operation. If six BCQ information nodes are used for one operation, the six information nodes and the other one operation are packaged as **bundle**. Then, in this case, the bundle size will be 6 + 1 = 7.
 
 ### Caution
 
-- Generated output_arrays will be start with comma.
+- If there is no BCQ information in original model, any changes will be applied.
diff --git a/compiler/bcq-tools/generate_bcq_metadata b/compiler/bcq-tools/generate_bcq_metadata
new file mode 100644
index 000000000..8405556aa
--- /dev/null
+++ b/compiler/bcq-tools/generate_bcq_metadata
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import tensorflow as tf
+
+import argparse
+import sys
+
+ONE_START_MAGICNUM = int(-2e9 + 27)
+ONE_END_MAGICNUM = int(2e9 - 27)
+
+
+def _get_parser():
+    """
+    Returns an ArgumentParser for generating BCQ metadata.
+    """
+    parser = argparse.ArgumentParser(
+        description=("Command line tool to generate metadata of BCQ nodes"))
+
+    # Input and output path.
+    parser.add_argument(
+        "-i",
+        "--input_path",
+        type=str,
+        help="Full filepath of the input file.",
+        required=True)
+    parser.add_argument(
+        "-o",
+        "--output_path",
+        type=str,
+        help="Full filepath of the output file.",
+        required=True)
+    parser.add_argument(
+        "-O",
+        "--output_arrays",
+        type=str,
+        help="Original model output arrays",
+        required=True)
+
+    return parser
+
+
+# This function is copied from
+# https://github.com/tensorflow/tensorflow/blob/r2.3/tensorflow/examples/label_image/label_image.py#L26
+def load_graph(model_file):
+    graph = tf.Graph()
+    graph_def = tf.compat.v1.GraphDef()
+
+    with open(model_file, "rb") as f:
+        graph_def.ParseFromString(f.read())
+    with graph.as_default():
+        tf.import_graph_def(graph_def, name="")
+
+    return graph
+
+
+def generate_metadata_header(original_graph, bcq_version, output_arrays):
+    # Generating metadata starts
+    metadata_values = np.array([ONE_START_MAGICNUM])
+
+    # Append BCQ version
+    metadata_values = np.append(metadata_values, bcq_version)
+
+    # Append original output count
+    output_cnt = output_arrays.count(',') + 1
+    metadata_values = np.append(metadata_values, output_cnt)
+
+    return metadata_values
+
+
+def generate_bcq_metadata_v1(flags):
+    """
+    BCQv1 contains following metadata.
+        - The number of each BCQ information set
+    """
+
+    is_valid = True
+    allowed_info_names = [
+        "bcqinfo_do_w_x", "bcqinfo_alpha", "bcqinfo_packed_binary_code",
+        "bcqinfo_number_of_clusters", "bcqinfo_size_of_clusters",
+        "bcqinfo_qbits_of_clusters", "bcqinfo_dequant_weight"
+    ]
+
+    original_graph = load_graph(flags.input_path)
+    original_graph_def = original_graph.as_graph_def()
+
+    prefix_infonames_dict = {}
+
+    for node in original_graph_def.node:
+        if node.op == "Const" and "/bcqinfo_" in node.name:
+            prefix_index = node.name.index("/bcqinfo_")
+            prefix = node.name[:prefix_index]
+            infoname = node.name[prefix_index + 1:]
+
+            if infoname not in allowed_info_names:
+                is_valid = False
+                break
+
+            if prefix not in prefix_infonames_dict:
+                prefix_infonames_dict[prefix] = set()
+
+            prefix_infonames_dict[prefix].add(infoname)
+
+    # All the number of BCQ information should be same
+    num_of_bcqinfo = -1
+    for key in prefix_infonames_dict:
+        infonames = prefix_infonames_dict[key]
+        if num_of_bcqinfo == -1:
+            num_of_bcqinfo = len(infonames)
+        elif num_of_bcqinfo != len(infonames):
+            is_valid = False
+
+    # The number of BCQv1 information should be 6 or 7
+    if num_of_bcqinfo != 6 and num_of_bcqinfo != 7:
+        is_valid = False
+
+    # If BCQ information is invalid, return original model
+    if is_valid == False:
+        return original_graph_def
+
+    new_graph_def = tf.compat.v1.GraphDef()
+    for node in original_graph_def.node:
+        new_node = new_graph_def.node.add()
+        new_node.CopyFrom(node)
+
+    # Generate metadata header
+    metadata_values = generate_metadata_header(original_graph, 1, flags.output_arrays)
+
+    # Append metadata of BCQv1
+    metadata_values = np.append(metadata_values, num_of_bcqinfo + 1)
+
+    # Finish generating metadata
+    metadata_values = np.append(metadata_values, ONE_END_MAGICNUM)
+
+    # Generate metadata tensor
+    metadata_tensor = tf.make_tensor_proto(metadata_values, tf.int32)
+
+    new_node = new_graph_def.node.add()
+    new_node.op = "Const"
+    new_node.name = "one_compiler/bcqinfo_one_metadata"
+    new_node.attr["dtype"].CopyFrom(
+        tf.core.framework.attr_value_pb2.AttrValue(type=tf.int32.as_datatype_enum))
+    new_node.attr["value"].tensor.CopyFrom(metadata_tensor)
+    return new_graph_def
+
+
+def determine_bcq_version(flags):
+    """
+    CAUTION : For now, BCQ has only one version and thus always returns 1 when BCQ
+    information nodes are included. If new BCQ version is introduced,
+    this function must be updated accordingly.
+
+    When BCQ information does not exist, -1 is returned.
+    """
+    bcq_version = -1
+
+    original_graph = load_graph(flags.input_path)
+    original_graph_def = original_graph.as_graph_def()
+
+    for node in original_graph_def.node:
+        if node.op == "Const" and "/bcqinfo_" in node.name:
+            bcq_version = 1
+            break
+
+    return bcq_version
+
+
+def generate_bcq_metadata(flags):
+    """
+    Basic format of metadata is as following.
+        - Magic number indicating start
+        - Version of BCQ Format
+        - The number of original outputs
+        - Metadata based on each BCQ format
+        - Magic number indicating end
+    """
+    program_version = 1
+    model_version = determine_bcq_version(flags)
+
+    if model_version == 1:
+        result_graph_def = generate_bcq_metadata_v1(flags)
+    elif model_version == -1:
+        # When there is no BCQ information, do nothing
+        result_graph_def = load_graph(flags.input_path)
+    else:
+        err_msg = "BCQ version of the model(v{}) ".format(model_version)
+        err_msg += "is higher than "
+        err_msg += "the version supported by this program(v{})".format(program_version)
+        raise SystemExit(err_msg)
+
+    tf.io.write_graph(result_graph_def, '.', flags.output_path, False)
+
+
+def main():
+    # Parse argument.
+    parser = _get_parser()
+    flags = parser.parse_known_args(args=sys.argv[1:])
+
+    # Generate a new pb file, which BCQ metadata is included.
+    generate_bcq_metadata(flags[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays
index 48e8a9373..b71a37410 100644
--- a/compiler/bcq-tools/generate_bcq_output_arrays
+++ b/compiler/bcq-tools/generate_bcq_output_arrays
@@ -1,5 +1,20 @@
 #!/usr/bin/env python3
 
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import tensorflow as tf
 
 import argparse
@@ -21,61 +36,223 @@ def _get_parser():
         help="Full filepath of the input file.",
         required=True)
     parser.add_argument(
-        "-o",
-        "--output_path",
+        "-m",
+        "--metadata_path",
+        type=str,
+        help="Full filepath for the file that provides metadata.",
+        required=True)
+    parser.add_argument(
+        "-A",
+        "--output_arrays_path",
         type=str,
-        help="Full filepath of the output file.",
+        help="Full filepath for the file that provides output arrays",
         required=True)
 
     return parser
 
 
-def load_graph(frozen_graph_filename):
-    """
-    Load graph from frozen pb file
-    """
-    with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
-        graph_def = tf.compat.v1.GraphDef()
+# This function is copied from
+# https://github.com/tensorflow/tensorflow/blob/r2.3/tensorflow/examples/label_image/label_image.py#L26
+def load_graph(model_file):
+    graph = tf.Graph()
+    graph_def = tf.compat.v1.GraphDef()
+
+    with open(model_file, "rb") as f:
         graph_def.ParseFromString(f.read())
-    with tf.Graph().as_default() as graph:
-        tf.import_graph_def(graph_def, name='')
+    with graph.as_default():
+        tf.import_graph_def(graph_def, name="")
+
     return graph
 
 
-def dtype2str(dtype):
-    if dtype == "int32":
-        return "TF_INT32"
-    elif dtype == "int64":
-        return "TF_INT64"
-    elif dtype == "float32":
-        return "TF_FLOAT"
-    elif dtype == "bool":
-        return "TF_BOOL"
-    else:
-        raise Exception("Not supported dtype")
+def find_bcq_version(flags):
+    """
+    If BCQ metadata exists, BCQ version is in the second element.
+    Return -1 when the metadata is not found.
+    """
+    graph = load_graph(flags.input_path)
+    graph_def = graph.as_graph_def()
+    for node in graph_def.node:
+        if node.op == "Const" and "one_compiler/bcqinfo_one_metadata" in node.name:
+            metadata_tensor = tf.make_ndarray(node.attr["value"].tensor)
+            return metadata_tensor[1]
+    return -1
+
+
+def print_bcqinfo_output_arrays_v1(flags):
+    """
+    This function generates a file which includes output arrays of BCQ v1
+    information bundles. Each bundle is consisted with one of candidate
+    operations (BCQ may be applied) and BCQ constant nodes related with
+    the operation.
+    """
+    graph = load_graph(flags.input_path)
+    graph_def = graph.as_graph_def()
+    ops = graph.get_operations()
 
+    # If there is a constant node named PREFIX_1/bcqinfo_alpha,
+    # it is used for applying BCQ to constant node named PREFIX_1.
+    # Collected prefixes will be used for connecting
+    # bcqinfo nodes and user operations of prefix nodes.
+    prefix_set = set()
+    has_dequant_weight = False
+    for op in ops:
+        if op.type == "Const" and "/bcqinfo_" in op.outputs[0].name:
+            # Metadata do not have prefix
+            if "one_compiler/bcqinfo_one_metadata" in op.outputs[0].name:
+                continue
 
-def print_output_arrays(flags):
-    graph_model = load_graph(flags.input_path)
-    graph_model_def = graph_model.as_graph_def()
-    ops = graph_model.get_operations()
+            prefix_index = op.outputs[0].name.index("/bcqinfo_")
+            prefix = op.outputs[0].name[:prefix_index]
+            prefix_set.add(prefix)
 
-    output_names = [op.outputs[0].name for op in ops 
-        if op.type == "Const" and "bcqinfo_" in op.outputs[0].name]
+            # Usually, output name of op is like "outputname:0"
+            # -2 is for removing ":0"
+            infoname = op.outputs[0].name[prefix_index + 1:-2]
+            if infoname == "bcqinfo_dequant_weight":
+                has_dequant_weight = True
 
-    output_arrays = ""    
-    for output_name in output_names:
-        output_arrays += ","
+    # Ideal situation is that the user nodes of BCQ applicable constant nodes
+    # are BCQ applicable operations such as MatMul, GatherV2, etc.
+    # However, operations which do not change original values such as
+    # Ideneity or Transpose can exist between them. In view of TensorFlow Lite,
+    # real user nodes of BCQ applicable constant nodes must be found first.
+    # This work is done by BFS search with queue.
 
-        colon_index = output_name.find(":")
-        if colon_index == -1:
-            output_arrays += output_name
-        else:
-            output_arrays += output_name[:colon_index]
+    prefix_node_dict = {}  # key : prefix / value : list of candidates
+    matmul_node_prefix_dict = {}  # key : Name of MatMul node / value : prefix
 
-    f = open(flags.output_path, 'w')
-    f.write(output_arrays)
-    f.close()
+    queue_prefix = list(prefix_set)
+    queue_nodename = [queue_prefix[idx] + ":0" for idx in range(len(queue_prefix))]
+
+    while len(queue_prefix) > 0:
+        prefix = queue_prefix.pop(0)
+        nodename = queue_nodename.pop(0)
+        if prefix not in prefix_node_dict.keys():
+            prefix_node_dict[prefix] = []
+
+        # Usually, output name of op is like "outputname:0"
+        # -2 is for removing ":0"
+        for op in ops:
+            if op.type == "MatMul" and (op.inputs[0].name == nodename
+                                        or op.inputs[1].name == nodename):
+                prefix_node_dict[prefix].append(op.outputs[0].name[:-2])
+                matmul_node_prefix_dict[op.outputs[0].name[:-2]] = prefix
+            elif op.type == "Einsum" and (op.inputs[0].name == nodename
+                                          or op.inputs[1].name == nodename):
+                prefix_node_dict[prefix].append(op.outputs[0].name[:-2])
+            elif op.type == "GatherV2" and op.inputs[0].name == nodename:
+                prefix_node_dict[prefix].append(op.outputs[0].name[:-2])
+            elif len(op.outputs) == 1:
+                for i in range(len(op.inputs)):
+                    if op.inputs[i].name == nodename:
+                        queue_prefix.append(prefix)
+                        queue_nodename.append(op.outputs[0].name)
+                        break
+
+    # When TensorFlow model is converted to TensorFlow Lite model,
+    # more than one operation can be fused as one.
+    # For example, MatMul + BiasAdd + ReLU in TensorFlow can be fused as
+    # one FullyConnected in TensorFlow Lite.
+    # It means that even real user nodes of BCQ applicable constant nodes
+    # in TensorFlow are found, they may be real user nodes in TensorFlow Lite.
+    # Therefore additional candidates of real user nodes should be found either.
+    # Finding additional candidates is done by BFS search with queue.
+
+    fuseop_prefix_dict = {}  # key : Candidate operation / Value : prefix
+
+    # These ops can be candidate. However other candidates may exists after these ops.
+    mark_type = ["Add", "AddV2", "BiasAdd", "Reshape", "Transpose"]
+
+    # These ops can be candidate. And no more candidates will be found after these ops.
+    mark_and_stop_type = ["Relu", "Relu6", "Tanh"]
+
+    # These ops cannot be candidates but other candidates may exists after these ops.
+    # NOTE : Some of following ops may be removed from the list but not sure for now.
+    pass_type = [
+        "BatchToSpaceND", "Cast", "DepthToSpace", "ExpandDims", "ResizeBilinear",
+        "ResizeNearestNeighbor", "ScatterNd", "SpaceToBatchND", "SpaceToDepth", "Squeeze",
+        "Identity", "Pack", "Unpack", "Stack"
+    ]
+
+    queue_prefix = list(matmul_node_prefix_dict.values())
+    queue_nodename = [matmul + ":0" for matmul in matmul_node_prefix_dict.keys()]
+
+    visited_nodes = set(queue_nodename)
+    while len(queue_prefix) > 0:
+        prefix = queue_prefix.pop(0)
+        nodename = queue_nodename.pop(0)
+
+        # Usually, output name of op is like "outputname:0"
+        # -2 is for removing ":0"
+        for op in ops:
+            for i in range(len(op.inputs)):
+                if nodename == op.inputs[i].name:
+                    if op.type in mark_type:
+                        if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys():
+                            fuseop_prefix_dict[op.outputs[0].name[:-2]] = set()
+                        fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix)
+                        if op.outputs[0].name not in visited_nodes:
+                            queue_prefix.append(prefix)
+                            queue_nodename.append(op.outputs[0].name)
+                            visited_nodes.add(op.outputs[0].name)
+                    elif op.type in mark_and_stop_type:
+                        if op.outputs[0].name[:-2] not in fuseop_prefix_dict.keys():
+                            fuseop_prefix_dict[op.outputs[0].name[:-2]] = set()
+                        fuseop_prefix_dict[op.outputs[0].name[:-2]].add(prefix)
+                    elif op.type in pass_type and op.outputs[0].name not in visited_nodes:
+                        queue_prefix.append(prefix)
+                        queue_nodename.append(op.outputs[0].name)
+                        visited_nodes.add(op.outputs[0].name)
+
+    # Write the name of metadata node
+    with open(flags.metadata_path, 'w') as f_metadata:
+        f_metadata.write("one_compiler/bcqinfo_one_metadata,")
+
+    # Write all pairs of candidate operations and related BCQ information nodes.
+    with open(flags.output_arrays_path, 'w') as f_arrays:
+        for prefix in prefix_set:
+            for fusable_op in prefix_node_dict[prefix]:
+                f_arrays.write("," + prefix + "/bcqinfo_do_w_x")
+                f_arrays.write("," + prefix + "/bcqinfo_alpha")
+                f_arrays.write("," + prefix + "/bcqinfo_packed_binary_code")
+                f_arrays.write("," + prefix + "/bcqinfo_number_of_clusters")
+                f_arrays.write("," + prefix + "/bcqinfo_size_of_clusters")
+                f_arrays.write("," + prefix + "/bcqinfo_qbits_of_clusters")
+                f_arrays.write("," + fusable_op)
+                if has_dequant_weight:
+                    f_arrays.write("," + prefix + "/bcqinfo_dequant_weight")
+        for fuseop in fuseop_prefix_dict.keys():
+            if len(fuseop_prefix_dict[fuseop]) == 1:
+                prefix = fuseop_prefix_dict[fuseop].pop()
+                f_arrays.write("," + prefix + "/bcqinfo_do_w_x")
+                f_arrays.write("," + prefix + "/bcqinfo_alpha")
+                f_arrays.write("," + prefix + "/bcqinfo_packed_binary_code")
+                f_arrays.write("," + prefix + "/bcqinfo_number_of_clusters")
+                f_arrays.write("," + prefix + "/bcqinfo_size_of_clusters")
+                f_arrays.write("," + prefix + "/bcqinfo_qbits_of_clusters")
+                f_arrays.write("," + fuseop)
+                if has_dequant_weight:
+                    f_arrays.write("," + prefix + "/bcqinfo_dequant_weight")
+
+
+def print_bcq_output_arrays(flags):
+    program_version = 1
+    model_version = find_bcq_version(flags)
+    
+    if model_version == 1:
+        print_bcqinfo_output_arrays_v1(flags)
+    elif model_version == -1:
+        # When BCQ information not found, print nothing.
+        f_metadata = open(flags.metadata_path, 'w')
+        f_arrays = open(flags.output_arrays_path, 'w')
+        f_metadata.close()
+        f_arrays.close()
+    else:
+        err_msg = "BCQ version of the model(v{}) ".format(model_version)
+        err_msg += "is higher than "
+        err_msg += "the version supported by this program(v{})".format(program_version)
+        raise SystemExit(err_msg)
 
 
 def main():
@@ -83,7 +260,7 @@ def main():
     parser = _get_parser()
     flags = parser.parse_known_args(args=sys.argv[1:])
 
-    print_output_arrays(flags[0])
+    print_bcq_output_arrays(flags[0])
 
 
 if __name__ == "__main__":
diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info
deleted file mode 100644
index 2ede8d4d0..000000000
--- a/compiler/bcq-tools/preserve_bcq_info
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env python3
-
-import tensorflow as tf
-import numpy as np
-
-import argparse
-import sys
-
-
-def _get_parser():
-    """
-    Returns an ArgumentParser for preserving BCQ information.
-    """
-    parser = argparse.ArgumentParser(
-        description=("Command line tool to preserve BCQ information"))
-
-    # Input and output path.
-    parser.add_argument(
-        "-i",
-        "--input_path",
-        type=str,
-        help="Full filepath of the input file.",
-        required=True)
-    parser.add_argument(
-        "-o",
-        "--output_path",
-        type=str,
-        help="Full filepath of the output file.",
-        required=True)
-
-    return parser
-
-
-def load_graph(frozen_graph_filename):
-    """
-    Load graph from frozen pb file
-    """
-    with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
-        graph_def = tf.compat.v1.GraphDef()
-        graph_def.ParseFromString(f.read())
-    with tf.Graph().as_default() as graph:
-        tf.import_graph_def(graph_def, name='')
-    return graph
-
-
-def preserve_bcq_info(flags):
-    """
-    Generate unique dummy value from -1 to -N.
-
-    We use negative values to preserve BCQ information because
-    positive values may cause some confusion with real BCQ information values.
-    """
-
-    class UniqueValueGen:
-        def __init__(self):
-            self.unique_value = -1
-
-        def gen(self):
-            val = self.unique_value
-            self.unique_value = val - 1
-            return val
-
-    unique_value = UniqueValueGen()
-
-    original_graph_model = load_graph(flags.input_path)
-    original_graph_model_def = original_graph_model.as_graph_def()
-
-    new_graph = tf.compat.v1.GraphDef()
-    substitution_dict = {}
-
-    DT_INT32 = None  # Just for copying DT_INT32 attribute value
-
-    for node in original_graph_model_def.node:
-        if node.op == "Const":
-            # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end.
-            # Therefore we should convert the type to INT32 type.
-            if "/bcqinfo_do_w_x" in node.name:
-                original_tensor = tf.make_ndarray(node.attr["value"].tensor)
-                substitution_dict[node.name] = tf.make_tensor_proto(
-                    [int(original_tensor[0]), unique_value.gen()], tf.int32)
-
-            preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters", 
-                "/bcqinfo_qbits_of_clusters"]
-
-            if any(name in node.name for name in preserved_bcqinfo_list):
-                original_tensor = tf.make_ndarray(
-                    node.attr["value"].tensor)  # variable name change
-                substitution_dict[node.name] = tf.make_tensor_proto(
-                    np.append(original_tensor, unique_value.gen()), tf.int32)
-                DT_INT32 = node.attr["dtype"]
-
-    for node in original_graph_model_def.node:
-        if node.name in substitution_dict:
-            new_node = new_graph.node.add()
-            new_node.op = "Const"
-            new_node.name = node.name
-            new_node.attr["dtype"].CopyFrom(DT_INT32)
-            new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name])
-        else:
-            new_node = new_graph.node.add()
-            new_node.CopyFrom(node)
-
-    tf.io.write_graph(new_graph, '.', flags.output_path, False)
-
-
-def main():
-    # Parse argument.
-    parser = _get_parser()
-    flags = parser.parse_known_args(args=sys.argv[1:])
-
-    # Generate a new pb file, which BCQ information is preserved.
-    preserve_bcq_info(flags[0])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
index a8d32564f..dee2f3620 100644
--- a/compiler/circle-tensordump/src/Dump.cpp
+++ b/compiler/circle-tensordump/src/Dump.cpp
@@ -182,6 +182,10 @@ H5::PredType hdf5_dtype_cast(const circle::TensorType &circle_type)
     {
       return H5::PredType::NATIVE_UINT8;
     }
+    case circle::TensorType_INT16:
+    {
+      return H5::PredType::NATIVE_INT16;
+    }
     case circle::TensorType_INT32:
     {
       return H5::PredType::NATIVE_INT32;
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 302c3a796..08fc4ec00 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -10,7 +10,11 @@
 
 ## TFLITE RECIPE
 
+Add(Net_TConv_Add_000 PASS fuse_add_with_tconv)
+Add(Net_TConv_Add_001 PASS fuse_add_with_tconv)
+Add(Net_TConv_Add_002 PASS fuse_add_with_tconv)
 Add(Net_TConv_BN_000 PASS fuse_batchnorm_with_tconv)
+Add(Net_TConv_BN_001 PASS fuse_batchnorm_with_tconv)
 Add(Net_InstanceNorm_001 PASS fuse_instnorm)
 Add(Net_InstanceNorm_002 PASS fuse_instnorm)
 Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 39ceade3a..940d01e36 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -60,6 +60,18 @@ int entry(int argc, char **argv)
   arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
       "Enable all optimize options");
 
+  arser.add_argument("--fold_dequantize")
+      .nargs(0)
+      .required(false)
+      .default_value(false)
+      .help("This will fold dequantize op");
+
+  arser.add_argument("--fuse_add_with_tconv")
+      .nargs(0)
+      .required(false)
+      .default_value(false)
+      .help("This will fuse Add operator to Transposed Convolution operator");
+
   arser.add_argument("--fuse_batchnorm_with_tconv")
       .nargs(0)
       .required(false)
@@ -111,6 +123,41 @@ int entry(int argc, char **argv)
   arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
   arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
 
+  // sparsification argument
+  arser.add_argument("--sparsify_tensor")
+      .nargs(1)
+      .type(arser::DataType::STR)
+      .required(false)
+      .help("Tensor name that you want to sparsify");
+
+  arser.add_argument("--sparsify_traversal_order")
+      .nargs(1)
+      .type(arser::DataType::STR)
+      .required(false)
+      .default_value("0,1,2,3")
+      .help("Traversal order of dimensions. Default value: 0,1,2,3");
+
+  arser.add_argument("--sparsify_format")
+      .nargs(1)
+      .type(arser::DataType::STR)
+      .required(false)
+      .default_value("d,s")
+      .help("Format of each dimension. 'd' stands for dense, 's' stands for sparse(CSR). Default "
+            "value: d,s");
+
+  arser.add_argument("--sparsify_block_size")
+      .nargs(1)
+      .type(arser::DataType::STR)
+      .required(false)
+      .help("Size of each block dimension");
+
+  arser.add_argument("--sparsify_block_map")
+      .nargs(1)
+      .type(arser::DataType::STR)
+      .required(false)
+      .default_value("0,1")
+      .help("Map from block dimension to the original tensor dimension. Default value: 0,1");
+
   try
   {
     arser.parse(argc, argv);
@@ -130,6 +177,10 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::ResolveCustomOpBatchMatMul);
     options->enable(Algorithms::ResolveCustomOpMatMul);
   }
+  if (arser.get<bool>("--fold_dequantize"))
+    options->enable(Algorithms::FoldDequantize);
+  if (arser.get<bool>("--fuse_add_with_tconv"))
+    options->enable(Algorithms::FuseAddWithTConv);
   if (arser.get<bool>("--fuse_batchnorm_with_tconv"))
     options->enable(Algorithms::FuseBatchNormWithTConv);
   if (arser.get<bool>("--fuse_bcq"))
@@ -151,6 +202,27 @@ int entry(int argc, char **argv)
   std::string input_path = arser.get<std::string>("input");
   std::string output_path = arser.get<std::string>("output");
 
+  if (arser["--sparsify_tensor"])
+  {
+    options->enable(Algorithms::SparsifyTensorPass);
+    options->param(AlgorithmParameters::Sparsify_tensor_name,
+                   arser.get<std::string>("--sparsify_tensor"));
+    options->param(AlgorithmParameters::Sparsify_traversal_order,
+                   arser.get<std::string>("--sparsify_traversal_order"));
+    options->param(AlgorithmParameters::Sparsify_format,
+                   arser.get<std::string>("--sparsify_format"));
+    if (arser["--sparsify_block_size"])
+      options->param(AlgorithmParameters::Sparsify_block_size,
+                     arser.get<std::string>("--sparsify_block_size"));
+    else
+    {
+      std::cerr << "ERROR: Block size not provided" << std::endl;
+      return 255;
+    }
+    options->param(AlgorithmParameters::Sparsify_block_map,
+                   arser.get<std::string>("--sparsify_block_map"));
+  }
+
   // Load model from the file
   foder::FileLoader file_loader{input_path};
   std::vector<char> model_data;
@@ -189,6 +261,7 @@ int entry(int argc, char **argv)
 
     // call luci optimizations
     optimizer.optimize(graph);
+    optimizer.sparsify(graph);
 
     if (!luci::validate(graph))
     {
diff --git a/compiler/circledump/src/Dump.cpp b/compiler/circledump/src/Dump.cpp
index c695b0721..f8e2d61f3 100644
--- a/compiler/circledump/src/Dump.cpp
+++ b/compiler/circledump/src/Dump.cpp
@@ -73,10 +73,34 @@ std::ostream &operator<<(std::ostream &os, const std::vector<int32_t> &vect)
   return os;
 }
 
-template <typename T> void dump_fbvect(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
+template <typename T>
+void dump_fbvect(std::ostream &os, const flatbuffers::Vector<T> *fbvect, uint32_t size)
+{
+  for (uint32_t q = 0; q < size; q++)
+  {
+    if (q)
+      os << ", ";
+    os << fbvect->Get(q);
+  }
+}
+
+template <>
+void dump_fbvect(std::ostream &os, const flatbuffers::Vector<uint8_t> *fbvect, uint32_t size)
+{
+  assert(fbvect);
+  for (uint32_t q = 0; q < size; q++)
+  {
+    if (q)
+      os << ", ";
+    os << static_cast<uint32_t>(fbvect->Get(q));
+  }
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
 {
   if (fbvect == nullptr)
-    return;
+    return os;
 
   bool ellipsis = (fbvect->size() > 4);
   auto limit_size = ellipsis ? 4 : fbvect->size();
@@ -85,22 +109,14 @@ template <typename T> void dump_fbvect(std::ostream &os, const flatbuffers::Vect
   {
     os << "(" << fbvect->size() << ") ";
   }
-  for (uint32_t q = 0; q < limit_size; q++)
-  {
-    if (q)
-      os << ", ";
-    os << fbvect->Get(q);
-  }
+
+  dump_fbvect(os, fbvect, limit_size);
+
   if (ellipsis)
   {
     os << " ... ";
   }
-}
 
-template <typename T>
-std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
-{
-  dump_fbvect(os, fbvect);
   return os;
 }
 
@@ -182,8 +198,90 @@ void dump_sub_graph(std::ostream &os, circleread::Reader &reader)
         os << std::endl;
       }
     }
+
+    if (const auto &s_params = tensor->sparsity())
+    {
+      std::string strsparsity = "    Sparsity: ";
+      std::string strsindent(strsparsity.size(), ' ');
+      os << strsparsity;
+
+      if (s_params->traversal_order())
+      {
+        os << "traversal_order(" << s_params->traversal_order() << ") ";
+        os << std::endl << strsindent;
+      }
+      if (s_params->block_map())
+      {
+        os << "block_map(" << s_params->block_map() << ") ";
+        os << std::endl << strsindent;
+      }
+      if (const auto &dim_metadata = s_params->dim_metadata())
+      {
+        uint32_t idx = 0;
+        for (const auto &dm : *dim_metadata)
+        {
+          std::string strdm = "dim_metadata[" + std::to_string(idx++) + "]: ";
+          std::string strdm_indent = strsindent + std::string(strdm.size(), ' ');
+          os << strdm;
+
+          os << "format(" << circle::EnumNameDimensionType(dm->format()) << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "dense_size(" << dm->dense_size() << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_segments_type("
+             << circle::EnumNameSparseIndexVector(dm->array_segments_type()) << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_segments(";
+          switch (dm->array_segments_type())
+          {
+            case circle::SparseIndexVector_NONE:
+              // DO NOTHING
+              break;
+            case circle::SparseIndexVector_Int32Vector:
+              os << dm->array_segments_as_Int32Vector()->values();
+              break;
+            case circle::SparseIndexVector_Uint16Vector:
+              os << dm->array_segments_as_Uint16Vector()->values();
+              break;
+            case circle::SparseIndexVector_Uint8Vector:
+              os << dm->array_segments_as_Uint8Vector()->values();
+              break;
+            default:
+              throw std::runtime_error("Invalid SparseIndexVector type of array_segments");
+          }
+          os << ")" << std::endl << strdm_indent;
+
+          os << "array_indices_type(" << circle::EnumNameSparseIndexVector(dm->array_indices_type())
+             << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_indices(";
+          switch (dm->array_indices_type())
+          {
+            case circle::SparseIndexVector_NONE:
+              // DO NOTHING
+              break;
+            case circle::SparseIndexVector_Int32Vector:
+              os << dm->array_indices_as_Int32Vector()->values();
+              break;
+            case circle::SparseIndexVector_Uint16Vector:
+              os << dm->array_indices_as_Uint16Vector()->values();
+              break;
+            case circle::SparseIndexVector_Uint8Vector:
+              os << dm->array_indices_as_Uint8Vector()->values();
+              break;
+            default:
+              throw std::runtime_error("Invalid SparseIndexVector type of array_indices");
+          }
+          os << ")" << std::endl << strsindent;
+        }
+      }
+    }
+    os << std::endl;
   }
-  os << std::endl;
 
   // dump operators
   os << "Operators: O(subgraph index : operator index) OpCodeName " << std::endl;
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index a0a063e79..ef22baaee 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -593,6 +593,25 @@ public:
   }
 };
 
+class UnidirectionalSequenceLSTMPrinter : public OpPrinter
+{
+public:
+  void options(const circle::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_UnidirectionalSequenceLSTMOptions())
+    {
+      os << "    ";
+      os << "Activation(" << EnumNameActivationFunctionType(params->fused_activation_function())
+         << ") ";
+      os << "cell_clip(" << params->cell_clip() << ") ";
+      os << "proj_clip(" << params->proj_clip() << ") ";
+      os << "time_major(" << params->time_major() << ") ";
+      os << "asymmetric_quantize_inputs(" << params->asymmetric_quantize_inputs() << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 class UniquePrinter : public OpPrinter
 {
 public:
@@ -707,6 +726,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_CONV_2D] = make_unique<Conv2DPrinter>();
   _op_map[circle::BuiltinOperator_DEPTH_TO_SPACE] = make_unique<DepthToSpacePrinter>();
   _op_map[circle::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
+  // There is no Option for DEQUANTIZE
   _op_map[circle::BuiltinOperator_DIV] = make_unique<DivPrinter>();
   // There is no Option for FLOOR
   // There is no Option for FLOOR_MOD
@@ -761,6 +781,8 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
   _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
   // There is no Option for TOPK_V2
+  _op_map[circle::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM] =
+      make_unique<UnidirectionalSequenceLSTMPrinter>();
   _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
   _op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
   _op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index 886f607cf..e64859f06 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -5,12 +5,7 @@
 
 #[[ optimize : Exclude from circle optimization(circle2circle) ]]
 ## TensorFlowLiteRecipes
-optimize(Unique_000)
-optimize(Unique_001)
-optimize(Unique_002)
-optimize(Unique_003)
-optimize(Unique_U8_000)
-optimize(Unique_U8_001)
+optimize(UnidirectionalSequenceLSTM_001) # This recipe contains is_variable Tensor
 
 ## CircleRecipes
 
@@ -45,13 +40,12 @@ tcgenerate(Conv2D_003) # runtime doesn't support dilation
 tcgenerate(Conv2D_U8_000)
 tcgenerate(Conv2D_U8_001)
 tcgenerate(Cos_000)
-tcgenerate(DepthToSpace_000)
 tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation
 tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation
 tcgenerate(DepthwiseConv2D_U8_000)
 tcgenerate(DepthwiseConv2D_U8_001)  # luci-interpreter doesn't support channel-wise quantization yet
+tcgenerate(Dequantize_000)  # runtime and luci-interpreter doesn't support Dequantize op yet
 tcgenerate(Div_000)
-tcgenerate(ELU_000)
 tcgenerate(Equal_000)
 tcgenerate(Exp_000)
 tcgenerate(ExpandDims_000)
@@ -74,13 +68,9 @@ tcgenerate(Greater_000)
 tcgenerate(GreaterEqual_000)
 tcgenerate(If_000)
 tcgenerate(If_001)
-tcgenerate(L2Normalize_000) # runtime doesn't support
-tcgenerate(L2Pool2D_000) # runtime doesn't support
 tcgenerate(L2Pool2D_U8_000)
-tcgenerate(LeakyRelu_000) # runtime doesn't support
 tcgenerate(Less_000)
 tcgenerate(LessEqual_000)
-tcgenerate(LocalResponseNormalization_000) # runtime doesn't support
 tcgenerate(Log_000)
 tcgenerate(LogicalAnd_000)
 tcgenerate(LogicalNot_000)
@@ -115,7 +105,6 @@ tcgenerate(Pack_U8_000)
 tcgenerate(Pad_U8_000)
 tcgenerate(PadV2_000)
 tcgenerate(Pow_000)
-tcgenerate(PRelu_000)
 tcgenerate(Range_000)
 tcgenerate(Rank_000)
 tcgenerate(ReduceAny_000)
@@ -139,7 +128,6 @@ tcgenerate(ResizeNearestNeighbor_000)
 tcgenerate(ReverseSequence_000)
 tcgenerate(ReverseV2_000)
 tcgenerate(Round_000)
-tcgenerate(Rsqrt_000)
 tcgenerate(ScatterNd_000)
 tcgenerate(SegmentSum_000)
 tcgenerate(Select_000)
@@ -150,32 +138,26 @@ tcgenerate(SelectV2_001)
 tcgenerate(SelectV2_002)
 tcgenerate(Shape_000)
 tcgenerate(Sin_000)
-tcgenerate(Slice_000)
 tcgenerate(Softmax_U8_000)
 tcgenerate(SpaceToBatchND_000)
 tcgenerate(SpaceToBatchND_001)
 tcgenerate(SpaceToBatchND_002)
 tcgenerate(SpaceToBatchND_003)
-tcgenerate(SpaceToDepth_000)
 tcgenerate(SparseToDense_000)
 tcgenerate(SplitV_000)
-tcgenerate(Sqrt_000)
 tcgenerate(Square_000)
 tcgenerate(SquaredDifference_000)
-tcgenerate(Squeeze_000)
-tcgenerate(StridedSlice_000)
-tcgenerate(StridedSlice_001)
-tcgenerate(StridedSlice_002)
 tcgenerate(Sub_000)
 tcgenerate(Sub_001)
 tcgenerate(Sub_U8_000)
 tcgenerate(Sum_000)
 tcgenerate(Sum_001)
-tcgenerate(Tanh_000)
 tcgenerate(Tile_000)
 tcgenerate(Tile_U8_000)
 tcgenerate(TopKV2_000)
 tcgenerate(TopKV2_001)
+tcgenerate(UnidirectionalSequenceLSTM_000) # runtime and luci-interpreter doesn't support UnidirectionalSequenceLSTM op yet
+tcgenerate(UnidirectionalSequenceLSTM_001) # runtime and luci-interpreter doesn't support UnidirectionalSequenceLSTM op yet
 tcgenerate(Unique_000)
 tcgenerate(Unique_001)
 tcgenerate(Unique_002)
diff --git a/compiler/loco/src/IR/Nodes.test.cpp b/compiler/loco/src/IR/Nodes.test.cpp
index 0b2210357..bd1c74253 100644
--- a/compiler/loco/src/IR/Nodes.test.cpp
+++ b/compiler/loco/src/IR/Nodes.test.cpp
@@ -523,11 +523,11 @@ TEST(TensorBroadcastTest, mapping)
 {
   loco::TensorBroadcast tensor_broadcast_node;
 
-  ASSERT_EQ(false, tensor_broadcast_node.mapping()->defined(0));
+  ASSERT_FALSE(tensor_broadcast_node.mapping()->defined(0));
 
   tensor_broadcast_node.mapping()->dim(0) = 3;
 
-  ASSERT_EQ(true, tensor_broadcast_node.mapping()->defined(0));
+  ASSERT_TRUE(tensor_broadcast_node.mapping()->defined(0));
   ASSERT_EQ(3, tensor_broadcast_node.mapping()->dim(0));
 }
 
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index 65d119761..06926cdc1 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -72,6 +72,11 @@ struct DepthwiseConv2DParams
   Activation activation;
 };
 
+struct DivParams
+{
+  Activation activation;
+};
+
 struct FullyConnectedParams
 {
   Activation activation;
@@ -115,6 +120,23 @@ struct ReducerParams
   bool keep_dims;
 };
 
+struct ResizeBilinearParams
+{
+  bool align_corners;
+  bool half_pixel_centers;
+};
+
+struct ResizeNearestNeighborParams
+{
+  bool align_corners;
+  bool half_pixel_centers;
+};
+
+struct SubParams
+{
+  Activation activation;
+};
+
 struct SpaceToDepthParams
 {
   int block_size;
diff --git a/compiler/luci-interpreter/src/kernels/Add.cpp b/compiler/luci-interpreter/src/kernels/Add.cpp
index 9ed155e94..8d119d516 100644
--- a/compiler/luci-interpreter/src/kernels/Add.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Add.h"
 
+#include "kernels/BinaryOpCommon.h"
 #include "kernels/Utils.h"
 
 #include <tensorflow/lite/kernels/internal/reference/add.h>
@@ -36,10 +37,13 @@ Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddPa
 
 void Add::configure()
 {
-  if (input1()->element_type() != input2()->element_type())
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  if (input1()->element_type() == DataType::S16)
   {
-    throw std::runtime_error("Input Tensor Data Type Mismatch.");
+    LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 &&
+                           output()->zero_point() == 0);
   }
+
   output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
 }
 
@@ -53,6 +57,9 @@ void Add::execute() const
     case DataType::U8:
       evalQuantized();
       break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -140,5 +147,49 @@ void Add::evalQuantized() const
   }
 }
 
+void Add::evalQuantizedS16() const
+{
+  const auto input1_scale = static_cast<double>(input1()->scale());
+  const auto input2_scale = static_cast<double>(input2()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+
+  constexpr int left_shift = 12;
+  const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
+  const double real_input1_multiplier = input1_scale / twice_max_input_scale;
+  const double real_input2_multiplier = input2_scale / twice_max_input_scale;
+  const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
+
+  int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
+  int input1_shift{}, input2_shift{}, output_shift{};
+  quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
+  quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
+  quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  auto fn = [input1_multiplier, input1_shift, //
+             input2_multiplier, input2_shift, //
+             output_multiplier, output_shift, //
+             activation_min, activation_max](int16_t input1_val, int16_t input2_val) {
+    const int32_t shifted_input1_val = static_cast<int32_t>(input1_val) << left_shift;
+    const int32_t shifted_input2_val = static_cast<int32_t>(input2_val) << left_shift;
+    const int32_t scaled_input1_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+        shifted_input1_val, input1_multiplier, input1_shift);
+    const int32_t scaled_input2_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+        shifted_input2_val, input2_multiplier, input2_shift);
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+        raw_sum, output_multiplier, output_shift);
+    const int32_t clamped_output = std::min(activation_max, std::max(activation_min, raw_output));
+    return static_cast<int16_t>(clamped_output);
+  };
+
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<int16_t>(input1()),
+                        getTensorShape(input2()), getTensorData<int16_t>(input2()),
+                        getTensorShape(output()), getTensorData<int16_t>(output()), fn);
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Add.h b/compiler/luci-interpreter/src/kernels/Add.h
index a1f7e0406..79518845d 100644
--- a/compiler/luci-interpreter/src/kernels/Add.h
+++ b/compiler/luci-interpreter/src/kernels/Add.h
@@ -40,6 +40,7 @@ public:
 private:
   void evalFloat() const;
   void evalQuantized() const;
+  void evalQuantizedS16() const;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-interpreter/src/kernels/Add.test.cpp
index 705b648c8..de8a3bbb0 100644
--- a/compiler/luci-interpreter/src/kernels/Add.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.test.cpp
@@ -57,18 +57,10 @@ TEST(AddTest, Uint8)
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
   for (int i = 0; i < output_data.size(); i++)
   {
-    Tensor input1_tensor{
-        getElementType<uint8_t>(), base_shape, {{quant_param.first}, {quant_param.second}}, ""};
-    Tensor input2_tensor{
-        getElementType<uint8_t>(), test_shapes[i], {{quant_param.first}, {quant_param.second}}, ""};
-    std::vector<uint8_t> quantized_input1_value =
-        quantize<uint8_t>(base_data, quant_param.first, quant_param.second);
-    std::vector<uint8_t> quantized_input2_value =
-        quantize<uint8_t>(test_data, quant_param.first, quant_param.second);
-    input1_tensor.writeData(quantized_input1_value.data(),
-                            quantized_input1_value.size() * sizeof(uint8_t));
-    input2_tensor.writeData(quantized_input2_value.data(),
-                            quantized_input2_value.size() * sizeof(uint8_t));
+    Tensor input1_tensor =
+        makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
+                                                         quant_param.second, test_data);
     Tensor output_tensor =
         makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -79,26 +71,17 @@ TEST(AddTest, Uint8)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor),
-                                    output_tensor.scale(), output_tensor.zero_point()),
-                ElementsAreArray(ArrayFloatNear(output_data[i], kQuantizedTolerance)));
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
     EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
   }
   // Re-run with exchanged inputs.
   for (int i = 0; i < output_data.size(); i++)
   {
-    Tensor input1_tensor{
-        getElementType<uint8_t>(), test_shapes[i], {{quant_param.first}, {quant_param.second}}, ""};
-    Tensor input2_tensor{
-        getElementType<uint8_t>(), base_shape, {{quant_param.first}, {quant_param.second}}, ""};
-    std::vector<uint8_t> quantized_input1_value =
-        quantize<uint8_t>(test_data, quant_param.first, quant_param.second);
-    std::vector<uint8_t> quantized_input2_value =
-        quantize<uint8_t>(base_data, quant_param.first, quant_param.second);
-    input1_tensor.writeData(quantized_input1_value.data(),
-                            quantized_input1_value.size() * sizeof(uint8_t));
-    input2_tensor.writeData(quantized_input2_value.data(),
-                            quantized_input2_value.size() * sizeof(uint8_t));
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
+                                                         quant_param.second, test_data);
+    Tensor input2_tensor =
+        makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
     Tensor output_tensor =
         makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -109,9 +92,8 @@ TEST(AddTest, Uint8)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor),
-                                    output_tensor.scale(), output_tensor.zero_point()),
-                ElementsAreArray(ArrayFloatNear(output_data[i], kQuantizedTolerance)));
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
     EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
   }
 }
@@ -145,8 +127,7 @@ TEST(AddTest, Float)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ::testing::ElementsAreArray(ArrayFloatNear(test_outputs[i], 0.0001f)))
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
         << "With shape number " << i;
   }
   // Re-run with exchanged inputs.
@@ -163,8 +144,72 @@ TEST(AddTest, Float)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ::testing::ElementsAreArray(ArrayFloatNear(test_outputs[i], 0.0001f)))
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
+        << "With shape number " << i;
+  }
+}
+
+TEST(AddTest, SInt16)
+{
+  Shape base_shape = {2, 3, 1, 2};
+  std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<int32_t>> ref_output_shapes{
+      {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+
+  std::vector<float> input1_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                                 1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::vector<std::vector<float>> ref_outputs = {
+      {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+       1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
+       0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
+      {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
+      {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+       1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
+       0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
+      {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
+
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data);
+    Tensor input2_tensor =
+        makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
+    Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
+    const float tolerance = output_tensor.scale();
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(output_tensor),
+                ::testing::ElementsAreArray(ref_output_shapes[i]))
+        << "With shape number " << i;
+    EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
+        << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs and different scales.
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor =
+        makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data);
+    Tensor output_tensor = makeOutputTensor(DataType::S16, 5.0 / 32767, 0);
+    const float tolerance = output_tensor.scale();
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(output_tensor),
+                ::testing::ElementsAreArray(ref_output_shapes[i]))
+        << "With shape number " << i;
+    EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
         << "With shape number " << i;
   }
 }
diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
index 2ab7ff0da..c6734a114 100644
--- a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
@@ -32,12 +32,9 @@ void Check(std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> output_shape, std::initializer_list<T1> input_data,
            std::initializer_list<int32_t> dimension_data, std::initializer_list<T2> output_data)
 {
-
-  Tensor input_tensor{getElementType<T1>(), input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(T1));
-  Tensor dimension_tensor{DataType::S32, dimension_shape, {}, ""};
-  dimension_tensor.writeData(dimension_data.begin(), dimension_data.size() * sizeof(int32_t));
-
+  constexpr DataType element_type = getElementType<T1>();
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor dimension_tensor = makeInputTensor<DataType::S32>(dimension_shape, dimension_data);
   Tensor output_tensor = makeOutputTensor(getElementType<T2>());
 
   ArgMaxParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
index cdd81d7d6..df54f9786 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
@@ -18,6 +18,7 @@
 
 #include "kernels/Utils.h"
 
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
 #include <tensorflow/lite/kernels/internal/reference/pooling.h>
 
 #include <stdexcept>
@@ -61,11 +62,13 @@ void AveragePool2D::configure()
       computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
   if (input()->element_type() == DataType::U8)
   {
-    if (input()->scale() != output()->scale() || input()->zero_point() != output()->zero_point())
-    {
-      throw std::runtime_error(
-          "Quantization param for Input and output must be same(scale or zero-point)");
-    }
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
+  else if (input()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
   }
   output()->resize({batches, output_height, output_width, depth});
 }
@@ -80,6 +83,9 @@ void AveragePool2D::execute() const
     case DataType::U8:
       evalQuantized();
       break;
+    case DataType::S16:
+      evalSInt16();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -126,5 +132,26 @@ void AveragePool2D::evalQuantized() const
                                      getTensorData<uint8_t>(output()));
 }
 
+void AveragePool2D::evalSInt16() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::PoolParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.filter_height = _params.filter_height;
+  params.filter_width = _params.filter_width;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  tflite::reference_integer_ops::AveragePool(
+      params, getTensorShape(input()), getTensorData<int16_t>(input()), //
+      getTensorShape(output()), getTensorData<int16_t>(output()));
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.h b/compiler/luci-interpreter/src/kernels/AveragePool2D.h
index 91f212b3a..282a58797 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.h
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.h
@@ -39,6 +39,7 @@ public:
 private:
   void evalFloat() const;
   void evalQuantized() const;
+  void evalSInt16() const;
 
 private:
   int32_t _padding_height{};
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
index cc80e5e90..83e48c89d 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
@@ -53,25 +53,21 @@ TEST(AveragePool2DTest, Float)
       0, 1.5, //
       4.5, 6, //
   };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
 }
 
 TEST(AveragePool2DTest, Uint8_0)
 {
+  std::vector<float> input_data{
+      0,  -6, 12, 4, //
+      -3, -2, 10, 7, //
+  };
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
-  Tensor input_tensor{DataType::U8, {1, 2, 4, 1}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
-  std::vector<uint8_t> quant_input = quantize<uint8_t>(
-      {
-          0, -6, 12, 4,  //
-          -3, -2, 10, 7, //
-      },
-      quant_param.first, quant_param.second);
-  input_tensor.writeData(quant_input.data(), quant_input.size() * sizeof(uint8_t));
-
   Pool2DParams params{};
   params.padding = Padding::VALID;
   params.filter_height = 2;
@@ -84,26 +80,22 @@ TEST(AveragePool2DTest, Uint8_0)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(dequantize(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                         output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear({0.0, 6.0})));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0.0, 6.0}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
 }
 
 TEST(AveragePool2DTest, Uint8_1)
 {
+  std::vector<float> input_data{
+      0, 6, 12, 4, //
+      3, 2, 10, 7, //
+  };
+
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
-  Tensor input_tensor{DataType::U8, {1, 2, 4, 1}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
-  std::vector<uint8_t> quant_input = quantize<uint8_t>(
-      {
-          0, 6, 12, 4, //
-          3, 2, 10, 7, //
-      },
-      quant_param.first, quant_param.second);
-  input_tensor.writeData(quant_input.data(), quant_input.size() * sizeof(uint8_t));
-
   Pool2DParams params{};
   params.padding = Padding::VALID;
   params.filter_height = 2;
@@ -116,12 +108,42 @@ TEST(AveragePool2DTest, Uint8_1)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(dequantize(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                         output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear({2.75, 6.0})));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({2.75, 6.0}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
 }
 
+TEST(AveragePool2DTest, SInt16)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{
+      -4, -3, -2, -1, 0,  //
+      1,  2,  3,  4,  5,  //
+      6,  7,  8,  9,  10, //
+  };
+  std::vector<float> ref_output_data{
+      0, 1.5, //
+      4.5, 6, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
 TEST(AveragePool2DTest, Invalid_Input_Shape_NEG)
 {
   Shape input_shape{1, 3, 5};
@@ -170,20 +192,17 @@ TEST(AveragePool2DTest, In_Out_Type_NEG)
 
 TEST(AveragePool2DTest, Quant_Param_NEG)
 {
+  std::vector<float> input_data{
+      0,  -6, 12, 4, //
+      -3, -2, 10, 7, //
+  };
+
   std::pair<float, int32_t> quant_param1 = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
   std::pair<float, int32_t> quant_param2 = quantizationParams<uint8_t>(-7.875f, 7.875f);
-  Tensor input_tensor{
-      DataType::U8, {1, 2, 4, 1}, {{quant_param1.first}, {quant_param1.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param1.first,
+                                                      quant_param1.second, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param2.first, quant_param2.second);
 
-  std::vector<uint8_t> quant_input = quantize<uint8_t>(
-      {
-          0, -6, 12, 4,  //
-          -3, -2, 10, 7, //
-      },
-      quant_param1.first, quant_param1.second);
-  input_tensor.writeData(quant_input.data(), quant_input.size() * sizeof(uint8_t));
-
   Pool2DParams params{};
   params.padding = Padding::VALID;
   params.filter_height = 2;
diff --git a/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h b/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h
new file mode 100644
index 000000000..62bd4158e
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
+#define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Derived from tensorflow/lite/kernels/internal/reference/maximum_minimum.h (v2.3.0).
+template <typename T, typename Op, int N = 5>
+void BinaryOpBroadcastSlow(const tflite::RuntimeShape &unextended_input1_shape,
+                           const T *input1_data,
+                           const tflite::RuntimeShape &unextended_input2_shape,
+                           const T *input2_data,
+                           const tflite::RuntimeShape &unextended_output_shape, T *output_data,
+                           Op op)
+{
+  if (unextended_input1_shape == unextended_input2_shape)
+  {
+    const int flat_size = tflite::MatchingElementsSize(
+        unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
+    for (int i = 0; i < flat_size; ++i)
+    {
+      output_data[i] = op(input1_data[i], input2_data[i]);
+    }
+  }
+  else
+  {
+    assert(unextended_input1_shape.DimensionsCount() <= N);
+    assert(unextended_input2_shape.DimensionsCount() <= N);
+    assert(unextended_output_shape.DimensionsCount() <= N);
+
+    tflite::NdArrayDesc<N> desc1{};
+    tflite::NdArrayDesc<N> desc2{};
+    tflite::NdArrayDesc<N> output_desc{};
+    tflite::NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape,
+                                                &desc1, &desc2);
+    tflite::CopyDimsToDesc(tflite::RuntimeShape::ExtendedShape(N, unextended_output_shape),
+                           &output_desc);
+
+    auto fn = [&](int indexes[N]) {
+      output_data[SubscriptToIndex(output_desc, indexes)] =
+          op(input1_data[SubscriptToIndex(desc1, indexes)],
+             input2_data[SubscriptToIndex(desc2, indexes)]);
+    };
+    tflite::NDOpsHelper<N>(output_desc, fn);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
index 040ac5911..b460321bd 100644
--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
@@ -16,10 +16,22 @@ set(SOURCES
     DepthToSpace.cpp
     DepthwiseConv2D.h
     DepthwiseConv2D.cpp
+    Div.h
+    Div.cpp
     Elu.h
     Elu.cpp
+    Floor.h
+    Floor.cpp
+    FloorDiv.h
+    FloorDiv.cpp
+    Equal.h
+    Equal.cpp
     FullyConnected.h
     FullyConnected.cpp
+    Greater.h
+    Greater.cpp
+    GreaterEqual.h
+    GreaterEqual.cpp
     If.h
     If.cpp
     L2Normalize.h
@@ -28,20 +40,44 @@ set(SOURCES
     L2Pool2D.cpp
     LeakyRelu.h
     LeakyRelu.cpp
+    Less.h
+    Less.cpp
+    LessEqual.h
+    LessEqual.cpp
     LocalResponseNormalization.h
     LocalResponseNormalization.cpp
     Logistic.h
     Logistic.cpp
+    LogSoftmax.h
+    LogSoftmax.cpp
+    Maximum.h
+    Maximum.cpp
     MaxPool2D.h
     MaxPool2D.cpp
     Mean.h
     Mean.cpp
+    Minimum.h
+    Minimum.cpp
     Mul.h
     Mul.cpp
+    NotEqual.h
+    NotEqual.cpp
     Pad.h
     Pad.cpp
+    Pow.h
+    Pow.cpp
+    Prelu.h
+    Prelu.cpp
+    Relu.h
+    Relu.cpp
+    Relu6.h
+    Relu6.cpp
     Reshape.h
     Reshape.cpp
+    ResizeBilinear.h
+    ResizeBilinear.cpp
+    ResizeNearestNeighbor.h
+    ResizeNearestNeighbor.cpp
     Reverse.h
     Reverse.cpp
     Rsqrt.h
@@ -60,6 +96,8 @@ set(SOURCES
     Sqrt.cpp
     Squeeze.h
     Squeeze.cpp
+    Sub.h
+    Sub.cpp
     Tanh.h
     Tanh.cpp
     Transpose.h
@@ -69,7 +107,11 @@ set(SOURCES
     Unpack.h
     Unpack.cpp)
 
-list(APPEND SOURCES Utils.h Utils.cpp ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
+list(APPEND SOURCES
+    BinaryOpCommon.h
+    Utils.h
+    Utils.cpp
+    ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
 
 add_library(luci_interpreter_kernels STATIC ${SOURCES})
 set_target_properties(luci_interpreter_kernels PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -92,19 +134,37 @@ set(TEST_SOURCES
     Conv2D.test.cpp
     DepthToSpace.test.cpp
     DepthwiseConv2D.test.cpp
+    Div.test.cpp
     Elu.test.cpp
+    Floor.test.cpp
+    FloorDiv.test.cpp
+    Equal.test.cpp
     FullyConnected.test.cpp
+    Greater.test.cpp
+    GreaterEqual.test.cpp
     If.test.cpp
     L2Normalize.test.cpp
     L2Pool2D.test.cpp
     LeakyRelu.test.cpp
+    Less.test.cpp
+    LessEqual.test.cpp
     LocalResponseNormalization.test.cpp
     Logistic.test.cpp
+    LogSoftmax.test.cpp
+    Maximum.test.cpp
     MaxPool2D.test.cpp
     Mean.test.cpp
+    Minimum.test.cpp
     Mul.test.cpp
+    NotEqual.test.cpp
     Pad.test.cpp
+    Pow.test.cpp
+    Prelu.test.cpp
+    Relu.test.cpp
+    Relu6.test.cpp
     Reshape.test.cpp
+    ResizeBilinear.test.cpp
+    ResizeNearestNeighbor.test.cpp
     Reverse.test.cpp
     Rsqrt.test.cpp
     Slice.test.cpp
@@ -114,6 +174,7 @@ set(TEST_SOURCES
     StridedSlice.test.cpp
     Sqrt.test.cpp
     Squeeze.test.cpp
+    Sub.test.cpp
     Tanh.test.cpp
     Transpose.test.cpp
     TransposeConv.test.cpp
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
index 812ab7609..6f8820446 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
@@ -36,20 +36,20 @@ Concatenation::Concatenation(std::vector<const Tensor *> inputs, Tensor *output,
 void Concatenation::configure()
 {
   const int num_inputs = _inputs.size();
-  assert(num_inputs > 0);
+  LUCI_INTERPRETER_CHECK(num_inputs > 0);
   const Tensor *t0 = _inputs[0];
 
   int axis = _params.axis;
   if (axis < 0)
     axis += t0->shape().num_dims();
-  assert(axis >= 0 && axis < t0->shape().num_dims());
+  LUCI_INTERPRETER_CHECK(axis >= 0 && axis < t0->shape().num_dims());
 
   int32_t sum_axis = t0->shape().dim(axis);
   for (int i = 1; i < num_inputs; ++i)
   {
     const Tensor *tensor = _inputs[i];
-    assert(tensor->element_type() == t0->element_type());
-    assert(tensor->shape().num_dims() == t0->shape().num_dims());
+    LUCI_INTERPRETER_CHECK(tensor->element_type() == t0->element_type());
+    LUCI_INTERPRETER_CHECK(tensor->shape().num_dims() == t0->shape().num_dims());
     for (int d = 0; d < t0->shape().num_dims(); ++d)
     {
       if (d == axis)
@@ -58,7 +58,7 @@ void Concatenation::configure()
       }
       else
       {
-        assert(tensor->shape().dim(d) == t0->shape().dim(d));
+        LUCI_INTERPRETER_CHECK(tensor->shape().dim(d) == t0->shape().dim(d));
       }
     }
   }
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
index d9a7097d0..91707a256 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
@@ -44,7 +44,7 @@ TEST(ConcatenationTest, Float)
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ElementsAreArray(ArrayFloatNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12})));
+                FloatArrayNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
   }
   {
     params.axis = -2; // Same as '0'.
@@ -54,7 +54,7 @@ TEST(ConcatenationTest, Float)
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ElementsAreArray(ArrayFloatNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12})));
+                FloatArrayNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
   }
   {
     params.axis = 1;
@@ -64,7 +64,7 @@ TEST(ConcatenationTest, Float)
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ElementsAreArray(ArrayFloatNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})));
+                FloatArrayNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
   }
   {
     params.axis = -1; // Same as '1'.
@@ -74,10 +74,96 @@ TEST(ConcatenationTest, Float)
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ElementsAreArray(ArrayFloatNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})));
+                FloatArrayNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
   }
 }
 
+TEST(ConcatenationTest, Input_Number_Check_NEG)
+{
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+
+  Concatenation kernel({}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ConcatenationTest, Invalid_Axis_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -3;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ConcatenationTest, Mismatching_Input_Type_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<uint8_t> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::U8>({2, 3}, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 3}, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ConcatenationTest, Mismatching_Input_Dimension_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12, 13, 14, 15};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ConcatenationTest, Unsupported_Configure_Type_NEG)
+{
+  std::vector<int8_t> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<int8_t> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor = makeInputTensor<DataType::S8>({2, 3}, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 3}, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S8);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
index 0446d9760..be8364528 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
@@ -66,8 +66,7 @@ TEST(Conv2DTest, Float)
       0,  40, 0, 44, // row = 1
   };
   std::vector<int32_t> ref_output_shape{1, 2, 2, 2};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -114,46 +113,38 @@ TEST(Conv2DTest, FloatCheck)
       37, 4, 3, // second batch, right
   };
   std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
 TEST(Conv2DTest, Uint8)
 {
+  std::vector<float> input_data{
+      // First batch
+      1, 1, 1, 1, // row = 1
+      2, 2, 2, 2, // row = 2
+                  // Second batch
+      1, 2, 3, 4, // row = 1
+      1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_data{
+      1,  2,  3,  4, // first 2x2 filter
+      -1, 1,  -1, 1, // second 2x2 filter
+      -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_data{1, 2, 3};
+
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
   std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
-  Shape bias_shape = {3};
-  Tensor input_tensor{
-      DataType::U8, {2, 2, 4, 1}, {{input_quant_param.first}, {input_quant_param.second}}, ""};
-  Tensor filter_tensor{
-      DataType::U8, {3, 2, 2, 1}, {{input_quant_param.first}, {input_quant_param.second}}, ""};
-  Tensor bias_tensor{
-      DataType::S32, bias_shape, {{input_quant_param.first * input_quant_param.first}, {0}}, ""};
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::U8>({3, 2, 2, 1}, input_quant_param.first,
+                                                       input_quant_param.second, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(
+      {3}, input_quant_param.first * input_quant_param.first, 0, bias_data);
   Tensor output_tensor =
       makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
-  std::vector<uint8_t> quantized_input = quantize<uint8_t>(
-      {
-          // First batch
-          1, 1, 1, 1, // row = 1
-          2, 2, 2, 2, // row = 2
-          // Second batch
-          1, 2, 3, 4, // row = 1
-          1, 2, 3, 4, // row = 2
-      },
-      input_quant_param.first, input_quant_param.second);
-  std::vector<uint8_t> quantized_filter = quantize<uint8_t>(
-      {
-          1, 2, 3, 4,   // first 2x2 filter
-          -1, 1, -1, 1, // second 2x2 filter
-          -1, -1, 1, 1, // third 2x2 filter
-      },
-      input_quant_param.first, input_quant_param.second);
-  std::vector<int32_t> bias_data =
-      quantize<int32_t>({1, 2, 3}, input_quant_param.first * input_quant_param.first, 0);
-  input_tensor.writeData(quantized_input.data(), quantized_input.size() * sizeof(uint8_t));
-  filter_tensor.writeData(quantized_filter.data(), quantized_filter.size() * sizeof(uint8_t));
-  bias_tensor.writeData(bias_data.data(), bias_data.size() * sizeof(int32_t));
 
   Conv2DParams params{};
   params.padding = Padding::VALID;
@@ -174,9 +165,7 @@ TEST(Conv2DTest, Uint8)
       37, 4, 3, // second batch, right
   };
   std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
-  EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor),
-                                  output_quant_param.first, output_quant_param.second),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
index cab63e26d..57238313c 100644
--- a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -30,20 +30,10 @@ DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpa
 
 void DepthToSpace::configure()
 {
-  if (input()->shape().num_dims() != 4)
-  {
-    throw std::runtime_error("Invalid input num_dims.");
-  }
-  if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 &&
-      output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 &&
-      output()->element_type() != DataType::S64)
-  {
-    throw std::runtime_error("Invalid output type");
-  }
-  if (input()->element_type() != output()->element_type())
-  {
-    throw std::runtime_error("Type mismatch on input and output.");
-  }
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32 ||
+                         output()->element_type() == DataType::U8)
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type())
   const int block_size = params().block_size;
   const int32_t input_height = input()->shape().dim(1);
   const int32_t input_width = input()->shape().dim(2);
@@ -52,9 +42,9 @@ void DepthToSpace::configure()
   int32_t output_width = input_width * block_size;
   int32_t output_channels = input_channels / block_size / block_size;
 
-  assert(input_height == output_height / block_size);
-  assert(input_width == output_width / block_size);
-  assert(input_channels == output_channels * block_size * block_size);
+  LUCI_INTERPRETER_CHECK(input_height == output_height / block_size);
+  LUCI_INTERPRETER_CHECK(input_width == output_width / block_size);
+  LUCI_INTERPRETER_CHECK(input_channels == output_channels * block_size * block_size);
 
   Shape output_shape(4);
   output_shape.dim(0) = input()->shape().dim(0);
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
index 1b805702d..3dee4ad36 100644
--- a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
@@ -55,6 +55,51 @@ TYPED_TEST(DepthToSpaceTest, SimpleCase)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
+TEST(DepthToSpaceTest, InvalidInputShape_NEG)
+{
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 2, 4};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthToSpaceParams params{};
+  params.block_size = 2;
+
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthToSpaceTest, InOutTypeMismatch_NEG)
+{
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 1, 2, 4};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  DepthToSpaceParams params{};
+  params.block_size = 2;
+
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthToSpaceTest, InvalidBlockSize_NEG)
+{
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 1, 2, 4};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthToSpaceParams params{};
+  params.block_size = 3;
+
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
index b01a5e086..99d52715b 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
@@ -48,33 +48,33 @@ void DepthwiseConv2D::configure()
   // We only support (1) and (3) for now.
   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
   {
-    assert(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
   }
   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
   {
-    assert(bias() == nullptr || bias()->element_type() == DataType::S32);
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
   }
   else
   {
     throw std::runtime_error("Unsupported type.");
   }
-  assert(output()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
 
   const Shape &input_shape = input()->shape();
   const Shape &filter_shape = filter()->shape();
-  assert(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
 
   const int32_t batches = input_shape.dim(0);
   const int32_t input_height = input_shape.dim(1);
   const int32_t input_width = input_shape.dim(2);
   // Filter format: [1, H, W, O].
-  assert(filter_shape.dim(0) == 1);
+  LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
   const int32_t filter_height = filter_shape.dim(1);
   const int32_t filter_width = filter_shape.dim(2);
   const int32_t channels_out = filter_shape.dim(3);
 
-  assert(bias() == nullptr ||
-         (bias()->shape().num_dims() == 1 && bias()->shape().dim(0) == channels_out));
+  LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
+                                               bias()->shape().dim(0) == channels_out));
 
   const int32_t output_height =
       computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
index a9b43d864..a5128289f 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
@@ -66,47 +66,37 @@ TEST(DepthwiseConv2DTest, Float)
       71,  0, 99,  0,  //
       167, 0, 227, 28, //
   };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
 }
 
 TEST(DepthwiseConv2DTest, Uint8)
 {
+  std::vector<float> input_data{
+      1, 2, 7,  8,  // column 1
+      3, 4, 9,  10, // column 2
+      5, 6, 11, 12, // column 3
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
   std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
 
-  Tensor input_tensor{
-      DataType::U8, {1, 3, 2, 2}, {{input_quant_param.first}, {input_quant_param.second}}, ""};
-  Tensor filter_tensor{
-      DataType::U8, {1, 2, 2, 4}, {{input_quant_param.first}, {input_quant_param.second}}, ""};
-  Tensor bias_tensor{
-      DataType::S32, {4}, {{input_quant_param.first * input_quant_param.first}, {0}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 3, 2, 2}, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 4}, input_quant_param.first,
+                                                       input_quant_param.second, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(
+      {4}, input_quant_param.first * input_quant_param.first, 0, bias_data);
   Tensor output_tensor =
       makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
-  std::vector<uint8_t> quant_input = quantize<uint8_t>(
-      {
-          1, 2, 7, 8,   // column 1
-          3, 4, 9, 10,  // column 2
-          5, 6, 11, 12, // column 3
-      },
-      input_quant_param.first, input_quant_param.second);
-  std::vector<uint8_t> quant_filter = quantize<uint8_t>(
-      {
-          1, 2, 3, 4,       //
-          -9, 10, -11, 12,  //
-          5, 6, 7, 8,       //
-          13, -14, 15, -16, //
-      },
-      input_quant_param.first, input_quant_param.second);
-  std::vector<int32_t> quant_bias =
-      quantize<int32_t>({1, 2, 3, 4}, input_quant_param.first * input_quant_param.first, 0);
-
-  input_tensor.writeData(quant_input.data(), quant_input.size() * sizeof(uint8_t));
-  filter_tensor.writeData(quant_filter.data(), quant_filter.size() * sizeof(uint8_t));
-  bias_tensor.writeData(quant_bias.data(), quant_bias.size() * sizeof(int32_t));
-
   DepthwiseConv2DParams params{};
   params.padding = Padding::VALID;
   params.depth_multiplier = 2;
@@ -124,12 +114,190 @@ TEST(DepthwiseConv2DTest, Uint8)
       71, -34, 99,  -20, //
       91, -26, 127, -4,  //
   };
-  EXPECT_THAT(dequantize(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                         output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
 }
 
+TEST(DepthwiseConv2DTest, InvalidBiasType_NEG)
+{
+  Shape input_shape{1, 4, 2, 2};
+  Shape filter_shape{1, 2, 2, 4};
+  Shape bias_shape{4};
+  std::vector<float> input_data{
+      1,  2,  7,  8,  //
+      3,  4,  9,  10, //
+      5,  6,  11, 12, //
+      13, 14, 15, 16, //
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<int32_t> bias_data{1, 2, 3, 4};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 2;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
+{
+  Shape input_shape{1, 4, 2, 2};
+  Shape filter_shape{1, 2, 2, 4};
+  Shape bias_shape{4};
+  std::vector<float> input_data{
+      1,  2,  7,  8,  //
+      3,  4,  9,  10, //
+      5,  6,  11, 12, //
+      13, 14, 15, 16, //
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 2;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthwiseConv2DTest, InvalidInputShape_NEG)
+{
+  Shape input_shape{4, 2, 2};
+  Shape filter_shape{2, 2, 4};
+  Shape bias_shape{4};
+  std::vector<float> input_data{
+      1,  2,  7,  8,  //
+      3,  4,  9,  10, //
+      5,  6,  11, 12, //
+      13, 14, 15, 16, //
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 2;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthwiseConv2DTest, InvalidFilterShape_NEG)
+{
+  Shape input_shape{1, 4, 2, 2};
+  Shape filter_shape{2, 1, 2, 4};
+  Shape bias_shape{4};
+  std::vector<float> input_data{
+      1,  2,  7,  8,  //
+      3,  4,  9,  10, //
+      5,  6,  11, 12, //
+      13, 14, 15, 16, //
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 2;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthwiseConv2DTest, InvalidBiasDim_NEG)
+{
+  Shape input_shape{1, 4, 2, 2};
+  Shape filter_shape{1, 2, 4, 2};
+  Shape bias_shape{4};
+  std::vector<float> input_data{
+      1,  2,  7,  8,  //
+      3,  4,  9,  10, //
+      5,  6,  11, 12, //
+      13, 14, 15, 16, //
+  };
+  std::vector<float> filter_data{
+      1,  2,   3,   4,   //
+      -9, 10,  -11, 12,  //
+      5,  6,   7,   8,   //
+      13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 2;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Div.cpp b/compiler/luci-interpreter/src/kernels/Div.cpp
new file mode 100644
index 000000000..e75876b3a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Div.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Div.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Div::Div(const Tensor *input1, const Tensor *input2, Tensor *output, const DivParams &params)
+    : KernelWithParams<DivParams>({input1, input2}, {output}, params)
+{
+}
+
+void Div::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Div::execute() const
+{
+  switch (input1()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Div::evalFloat() const
+{
+  float activation_min{};
+  float activation_max{};
+  calculateActivationRange(_params.activation, &activation_min, &activation_max);
+
+  tflite::ArithmeticParams params{};
+  params.float_activation_min = activation_min;
+  params.float_activation_max = activation_max;
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+      getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastDivSlow(
+        params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+        getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Div(params, getTensorShape(input1()), getTensorData<float>(input1()),
+                               getTensorShape(input2()), getTensorData<float>(input2()),
+                               getTensorShape(output()), getTensorData<float>(output()));
+  }
+}
+
+void Div::evalQuantized() const
+{
+  const auto input1_scale = static_cast<double>(input1()->scale());
+  const auto input2_scale = static_cast<double>(input2()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+
+  const double real_output_multiplier = input1_scale / (input2_scale * output_scale);
+
+  int32_t output_multiplier{};
+  int output_shift{};
+
+  quantizeMultiplier(real_output_multiplier, &output_multiplier, &output_shift);
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ArithmeticParams params{};
+
+  params.input1_offset = -input1()->zero_point(); // Note the '-'.
+  params.input2_offset = -input2()->zero_point(); // Note the '-'.
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = output_multiplier;
+  params.output_shift = output_shift;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+      getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastDivSlow(
+        params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
+        getTensorShape(input2()), getTensorData<uint8_t>(input2()), getTensorShape(output()),
+        getTensorData<uint8_t>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Div(params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
+                               getTensorShape(input2()), getTensorData<uint8_t>(input2()),
+                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Div.h b/compiler/luci-interpreter/src/kernels/Div.h
new file mode 100644
index 000000000..6040cdd02
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Div.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DIV_H
+#define LUCI_INTERPRETER_KERNELS_DIV_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Div : public KernelWithParams<DivParams>
+{
+public:
+  Div(const Tensor *input1, const Tensor *input2, Tensor *output, const DivParams &params);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DIV_H
diff --git a/compiler/luci-interpreter/src/kernels/Div.test.cpp b/compiler/luci-interpreter/src/kernels/Div.test.cpp
new file mode 100644
index 000000000..77eb2e9c1
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Div.test.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Div.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+float GetTolerance(float min, float max)
+{
+  const float kQuantizedStep = (max - min) / 255.0f;
+  const float kQuantizedTolerance = 2.0f * kQuantizedStep + kQuantizedStep * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(DivTest, Float)
+{
+  Shape base_shape = {2, 3, 1, 1};
+
+  std::vector<int32_t> output_shape = {2, 3, 1, 1};
+
+  std::vector<float> input1_data{0.3f, 2.3f, 0.9f, 0.5f, 0.8f, 1.1f};
+  std::vector<float> input2_data{0.2f, 1.6f, 0.5f, 0.4f, 1.6f, 0.4f};
+  std::vector<float> test_outputs{1.5f, 1.4375f, 1.8f, 1.25f, 0.5f, 2.75f};
+
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input2_data);
+
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(DivTest, FloatBroadcast)
+{
+  Shape input1_shape = {1, 3};
+  Shape input2_shape = {3, 1};
+
+  std::vector<float> input1_data{-0.3f, 2.3f, 0.9f};
+  std::vector<float> input2_data{0.2f, 1.6f, 0.5f};
+  std::vector<float> test_outputs{0.f, 11.5f, 4.5f, 0.f, 1.4375f, 0.5625f, 0.f, 4.6f, 1.8f};
+
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data);
+
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+}
+
+TEST(DivTest, Uint8)
+{
+  Shape base_shape = {1, 2, 2, 1};
+
+  std::vector<int32_t> output_shape = {1, 2, 2, 1};
+
+  std::vector<float> input1_data = {-0.8f, -0.2f, 0.3f, 0.7f};
+  std::vector<float> input2_data = {-0.8f, 0.4f, 0.8f, 1.0f};
+  std::vector<float> test_outputs{1.0f, 0.f, 0.375f, 0.7f};
+
+  const float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.f, 1.f);
+
+  Tensor input1_tensor =
+      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input1_data);
+  Tensor input2_tensor =
+      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input2_data);
+
+  Tensor output_tensor =
+      makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(test_outputs, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(DivTest, Input_Output_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DivTest, Invalid_Input_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Elu.cpp b/compiler/luci-interpreter/src/kernels/Elu.cpp
index 5de4a1f3b..456396055 100644
--- a/compiler/luci-interpreter/src/kernels/Elu.cpp
+++ b/compiler/luci-interpreter/src/kernels/Elu.cpp
@@ -31,7 +31,7 @@ Elu::Elu(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
 
 void Elu::configure()
 {
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   output()->resize(input()->shape());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Elu.test.cpp b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
index 52444cbea..0235d6552 100644
--- a/compiler/luci-interpreter/src/kernels/Elu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
@@ -29,9 +29,7 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor{DataType::FLOAT32, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(float));
-
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Elu kernel(&input_tensor, &output_tensor);
@@ -39,8 +37,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
   kernel.execute();
 
   (void)output_shape;
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ::testing::ElementsAreArray(ArrayFloatNear(output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
 }
 
 TEST(EluTest, SimpleElu)
@@ -59,6 +56,20 @@ TEST(EluTest, SimpleElu)
       });
 }
 
+TEST(EluTest, InOutTypeMismatch_NEG)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+      0, -6, 2,  -4,   //
+      3, -2, 10, -0.1, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Elu kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Equal.cpp b/compiler/luci-interpreter/src/kernels/Equal.cpp
new file mode 100644
index 000000000..f58de1250
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Equal.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Equal.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Equal::Equal(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void Equal::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void Equal::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Equal::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowEqual(op_params, getTensorShape(x()), x_data,
+                                                getTensorShape(y()), y_data,
+                                                getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::Equal(op_params, getTensorShape(x()), x_data, getTensorShape(y()),
+                                 y_data, getTensorShape(output()), output_data);
+  }
+}
+
+void Equal::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowEqualWithScaling(op_params, getTensorShape(x()), x_data,
+                                                           getTensorShape(y()), y_data,
+                                                           getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::EqualWithScaling(op_params, getTensorShape(x()), x_data,
+                                            getTensorShape(y()), y_data, getTensorShape(output()),
+                                            output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Equal.h b/compiler/luci-interpreter/src/kernels/Equal.h
new file mode 100644
index 000000000..69b3be774
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Equal.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Equal : public Kernel
+{
+public:
+  Equal(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_EQUAL_H
diff --git a/compiler/luci-interpreter/src/kernels/Equal.test.cpp b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
new file mode 100644
index 000000000..fb0de8bbf
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Equal.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(EqualTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true, false, // Row 1
+      false, true, false, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(EqualTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+      0.9, 0.7, 0.5, // Row 4
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  false, // Row 1
+      false, false, false, // Row 2
+      false, false, false, // Row 3
+      true,  true,  true,  // Row 4
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(EqualTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.5, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.5, 0.55, 0.5, // Row 1
+      -1,  0,   0.05, 1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true, false, false, // Row 1
+      false, true, true,  false, // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(EqualTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+      -1,   0.05, 0,    1,   // Row 4
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, false, false, false, // Row 1
+      false, false, true,  false, // Row 2
+      false, false, false, false, // Row 3
+      true,  true,  true,  true,  // Row 4
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(EqualTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(EqualTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Floor.cpp b/compiler/luci-interpreter/src/kernels/Floor.cpp
new file mode 100644
index 000000000..e3c4246cc
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Floor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Floor.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/floor.h>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Floor::Floor(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Floor::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  output()->resize(input()->shape());
+}
+
+void Floor::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Floor::evalFloat() const
+{
+  tflite::reference_ops::Floor(getTensorShape(input()), getTensorData<float>(input()),
+                               getTensorShape(output()), getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Floor.h b/compiler/luci-interpreter/src/kernels/Floor.h
new file mode 100644
index 000000000..ca3ad5997
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Floor.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FLOOR_H
+#define LUCI_INTERPRETER_KERNELS_FLOOR_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Floor : public Kernel
+{
+public:
+  Floor(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FLOOR_H
diff --git a/compiler/luci-interpreter/src/kernels/Floor.test.cpp b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
new file mode 100644
index 000000000..3e1ab6f3a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Floor.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(FloorTest, SimpleFloat)
+{
+  std::initializer_list<int32_t> input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+      0.2, 8.6, 2.4,  4.3,  // Row 1
+      3,   7.1, 10.5, -0.9, // Row 2
+  };
+
+  std::initializer_list<int32_t> ref_output_shape{1, 2, 4, 1};
+  std::vector<float> ref_output_data{
+      0, 8, 2,  4,  // Row 1
+      3, 7, 10, -1, // Row 2
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Floor kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST(FloorTest, Input_Output_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  Floor kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.cpp b/compiler/luci-interpreter/src/kernels/FloorDiv.cpp
new file mode 100644
index 000000000..b6f36cea3
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FloorDiv.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/binary_function.h>
+#include <cmath>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+FloorDiv::FloorDiv(const Tensor *input, const Tensor *alpha, Tensor *output)
+    : Kernel({input, alpha}, {output})
+{
+}
+
+void FloorDiv::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(y()->element_type() == output()->element_type());
+
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void FloorDiv::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void FloorDiv::evalFloat() const
+{
+  auto FloorDivFunc = [](float x, float y) -> float {
+    return std::floor(static_cast<double>(x) / static_cast<double>(y));
+  };
+
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+
+  // Check the denominator
+  for (int i = 0; i < getTensorShape(y()).FlatSize(); ++i)
+  {
+    LUCI_INTERPRETER_CHECK(y_data[i] != 0);
+  }
+
+  if (x()->shape() != y()->shape())
+  {
+    tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+        getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+        getTensorData<float>(output()), FloorDivFunc);
+  }
+  else
+  {
+    tflite::reference_ops::BinaryFunction<float, float, float>(
+        getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+        getTensorData<float>(output()), FloorDivFunc);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.h b/compiler/luci-interpreter/src/kernels/FloorDiv.h
new file mode 100644
index 000000000..e9c47d81a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
+#define LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class FloorDiv : public Kernel
+{
+public:
+  FloorDiv(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
new file mode 100644
index 000000000..a5bc700f7
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FloorDiv.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(FloorDivTest, FloatSimple)
+{
+  Shape x_shape{2, 3};
+  std::vector<float> x_data{
+      0.5, 2.4,  3.1,  // Row 1
+      1.9, -1.9, -2.8, // Row 2
+  };
+
+  Shape y_shape = x_shape;
+  std::vector<float> y_data{
+      2.0, 0.5,  3.0,  // Row 1
+      1.0, -1.0, -2.0, // Row 2
+  };
+
+  std::vector<int32_t> ref_output_shape{2, 3};
+  std::vector<float> ref_output_data{
+      0, 4, 1, // Row 1
+      1, 1, 1, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST(FloorDivTest, FloatBroadcast)
+{
+  Shape x_shape{1, 3};
+  std::vector<float> x_data{
+      0.5, 2.4, -3.1, // Row 1
+  };
+
+  Shape y_shape{3, 3};
+  std::vector<float> y_data{
+      1.0, 1.0,  1.0,  // Row 1
+      2.0, -0.5, -2.0, // Row 2
+      0.3, 0.7,  0.9,  // Row 3
+  };
+
+  std::vector<int32_t> ref_output_shape{3, 3};
+  std::vector<float> ref_output_data{
+      0, 2,  -4, // Row 1
+      0, -5, 1,  // Row 2
+      1, 3,  -4, // Row 3
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST(FloorDivTest, DivByZero_NEG)
+{
+  Shape shape{3};
+  std::vector<float> x_data{1, 0, -1};
+  std::vector<float> y_data{0, 0, 0};
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(shape, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(shape, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST(FloorDivTest, Input_Output_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(FloorDivTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
index 6529c5e77..7fa76d5e7 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -36,27 +36,54 @@ FullyConnected::FullyConnected(const Tensor *input, const Tensor *weights, const
 
 void FullyConnected::configure()
 {
-  if (weights()->element_type() != DataType::FLOAT32)
+  if (weights()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::U8);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::U8);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::S32)
+  }
+  else if (weights()->element_type() == DataType::FLOAT32)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
+  }
+  else
+  {
     throw std::runtime_error("Unsupported type.");
-
-  assert(input()->element_type() == DataType::FLOAT32);
-  assert(weights()->element_type() == DataType::FLOAT32);
-  assert(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
+  }
 
   const Shape &input_shape = input()->shape();
   const Shape &weights_shape = weights()->shape();
 
-  assert(weights_shape.num_dims() == 2);
-  assert(bias() == nullptr || bias()->shape().num_elements() == weights_shape.dim(0));
+  LUCI_INTERPRETER_CHECK(weights_shape.num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(bias() == nullptr ||
+                         bias()->shape().num_elements() == weights_shape.dim(0));
 
-  assert(input_shape.num_elements() % weights_shape.dim(1) == 0);
+  LUCI_INTERPRETER_CHECK(input_shape.num_elements() % weights_shape.dim(1) == 0);
   const int32_t batch_size = input_shape.num_elements() / weights_shape.dim(1);
   const int32_t num_units = weights_shape.dim(0);
 
+  if (bias())
+    LUCI_INTERPRETER_CHECK(bias()->shape().num_elements() == weights()->shape().dim(0));
+
   output()->resize({batch_size, num_units});
 }
 
-void FullyConnected::execute() const { evalFloat(); }
+void FullyConnected::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::U8:
+      evalQuantized();
+      break;
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
 
 void FullyConnected::evalFloat() const
 {
@@ -75,5 +102,38 @@ void FullyConnected::evalFloat() const
       getTensorShape(output()), getTensorData<float>(output()));
 }
 
+void FullyConnected::evalQuantized() const
+{
+  double real_multiplier = 0.0;
+  int output_shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  int32_t output_multiplier;
+  real_multiplier =
+      getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+  calculateActivationRangeQuantized(params().activation, output(), &output_activation_min,
+                                    &output_activation_max);
+
+  int32_t input_offset = -input()->zero_point();
+  int32_t filter_offset = -weights()->zero_point();
+  int32_t output_offset = output()->zero_point();
+
+  tflite::FullyConnectedParams op_params{};
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.lhs_cacheable = false;
+  op_params.rhs_cacheable = false;
+  tflite::reference_ops::FullyConnected(
+      op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+      getTensorShape(weights()), getTensorData<uint8_t>(weights()), getTensorShape(bias()),
+      getTensorData<int32_t>(bias()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.h b/compiler/luci-interpreter/src/kernels/FullyConnected.h
index 2e3174c74..204f11ebb 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.h
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.h
@@ -41,6 +41,7 @@ public:
 
 private:
   void evalFloat() const;
+  void evalQuantized() const;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
index 8077fcb5c..d194ce1a0 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -26,7 +26,85 @@ namespace
 
 using namespace testing;
 
-TEST(FullyConnectedTest, Float)
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
+           std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
+           std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
+}
+
+template <>
+void Check<uint8_t>(
+    std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
+    std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+    std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
+    std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  const float quantized_tolerance = getTolerance(-127, 128, 255);
+  std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor weights_tensor = makeInputTensor<DataType::U8>(weights_shape, input_quant_param.first,
+                                                        input_quant_param.second, weights_data);
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(
+      bias_shape, input_quant_param.first * input_quant_param.first, 0, bias_data);
+  Tensor output_tensor =
+      makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, quantized_tolerance));
+}
+
+template <typename T> class FullyConnectedTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(FullyConnectedTest, DataTypes);
+
+TYPED_TEST(FullyConnectedTest, Simple)
+{
+  Check<TypeParam>({3, 2, 2, 1}, {3, 6}, {3}, {2, 3},
+                   {
+                       -3, -5, 5, 4, 9, -2,  // batch = 0
+                       -3, -2, -4, 9, -8, 1, // batch = 1
+                   },
+                   {
+                       -3, -7, 4, -4, -6, 4, // unit = 0
+                       3, 5, 2, 3, -3, -8,   // unit = 1
+                       -3, 7, 4, 9, 0, -5,   // unit = 2
+                   },
+                   {-1, -5, -8}, {
+                                     0, 0, 32,   // batch = 0
+                                     22, 11, 47, // batch = 1
+                                 });
+}
+
+TEST(FullyConnectedTest, InvalidBiasType_NEG)
 {
   Shape input_shape{3, 2, 2, 1};
   std::vector<float> input_data{
@@ -40,6 +118,34 @@ TEST(FullyConnectedTest, Float)
       -3, 7,  4, 9,  0,  -5, // unit = 2
   };
   Shape bias_shape{3};
+  std::vector<int32_t> bias_data{-1, -5, -8};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(FullyConnectedTest, InvalidWeightShapeDim_NEG)
+{
+  Shape input_shape{3, 2, 2, 1};
+  std::vector<float> input_data{
+      -3, -5, 5,  4, 9,  -2, // batch = 0
+      -3, -2, -4, 9, -8, 1,  // batch = 1
+  };
+  Shape weights_shape{1, 3, 6};
+  std::vector<float> weights_data{
+      -3, -7, 4, -4, -6, 4,  // unit = 0
+      3,  5,  2, 3,  -3, -8, // unit = 1
+      -3, 7,  4, 9,  0,  -5, // unit = 2
+  };
+  Shape bias_shape{3};
   std::vector<float> bias_data{-1, -5, -8};
 
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -51,15 +157,38 @@ TEST(FullyConnectedTest, Float)
   params.activation = Activation::RELU;
 
   FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
-  kernel.configure();
-  kernel.execute();
+  EXPECT_ANY_THROW(kernel.configure());
+}
 
-  std::vector<float> ref_output_data{
-      0,  0,  32, // batch = 0
-      22, 11, 47, // batch = 1
+TEST(FullyConnectedTest, BiasElementNumWeightDimMismatch_NEG)
+{
+  Shape input_shape{3, 2, 2, 1};
+  std::vector<float> input_data{
+      -3, -5, 5,  4, 9,  -2, // batch = 0
+      -3, -2, -4, 9, -8, 1,  // batch = 1
   };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  Shape weights_shape{6, 3};
+  std::vector<float> weights_data{
+      -3, -7, 4,  // unit = 0
+      -4, -6, 4,  // unit = 1
+      3,  5,  2,  // unit = 2
+      3,  -3, -8, // unit = 3
+      -3, 7,  4,  // unit = 4
+      9,  0,  -5, // unit = 5
+  };
+  Shape bias_shape{3};
+  std::vector<float> bias_data{-1, -5, -8};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
+  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/Greater.cpp b/compiler/luci-interpreter/src/kernels/Greater.cpp
new file mode 100644
index 000000000..f0dd2db36
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Greater.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Greater.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Greater::Greater(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void Greater::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void Greater::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Greater::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowGreater(op_params, getTensorShape(x()), x_data,
+                                                  getTensorShape(y()), y_data,
+                                                  getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::Greater(op_params, getTensorShape(x()), x_data, getTensorShape(y()),
+                                   y_data, getTensorShape(output()), output_data);
+  }
+}
+
+void Greater::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowGreaterWithScaling(op_params, getTensorShape(x()), x_data,
+                                                             getTensorShape(y()), y_data,
+                                                             getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::GreaterWithScaling(op_params, getTensorShape(x()), x_data,
+                                              getTensorShape(y()), y_data, getTensorShape(output()),
+                                              output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Greater.h b/compiler/luci-interpreter/src/kernels/Greater.h
new file mode 100644
index 000000000..a65d29f5c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Greater.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GREATER_H
+#define LUCI_INTERPRETER_KERNELS_GREATER_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Greater : public Kernel
+{
+public:
+  Greater(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GREATER_H
diff --git a/compiler/luci-interpreter/src/kernels/Greater.test.cpp b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
new file mode 100644
index 000000000..3122fa840
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Greater.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(GreaterTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, false, true,  // Row 1
+      true,  false, false, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(GreaterTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, false, true,  // Row 1
+      true,  false, false, // Row 2
+      false, false, true,  // Row 3
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(GreaterTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.6, 0.5, // Row 1
+      -1,  0.05, 0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, false, true, true,  // Row 1
+      true,  false, true, false, // Row 2
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterTest, Uint8QuantizedRescale)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.6, 0.5, // Row 1
+      -1,  0.05, 0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, false, true, true,  // Row 1
+      true,  false, true, false, // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 3);
+
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true, false, true,  false, // Row 1
+      true, true,  false, false, // Row 2
+      true, false, true,  false, // Row 3
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(GreaterTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Greater kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp
new file mode 100644
index 000000000..68135e27c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GreaterEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+GreaterEqual::GreaterEqual(const Tensor *x, const Tensor *y, Tensor *output)
+    : Kernel({x, y}, {output})
+{
+}
+
+void GreaterEqual::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void GreaterEqual::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void GreaterEqual::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowGreaterEqual(op_params, getTensorShape(x()), x_data,
+                                                       getTensorShape(y()), y_data,
+                                                       getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::GreaterEqual(op_params, getTensorShape(x()), x_data, getTensorShape(y()),
+                                        y_data, getTensorShape(output()), output_data);
+  }
+}
+
+void GreaterEqual::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
+        op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
+        getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::GreaterEqualWithScaling(op_params, getTensorShape(x()), x_data,
+                                                   getTensorShape(y()), y_data,
+                                                   getTensorShape(output()), output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.h b/compiler/luci-interpreter/src/kernels/GreaterEqual.h
new file mode 100644
index 000000000..e948d698f
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class GreaterEqual : public Kernel
+{
+public:
+  GreaterEqual(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
new file mode 100644
index 000000000..11e62644c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GreaterEqual.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(GreaterEqualTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true, true,  // Row 1
+      true,  true, false, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(GreaterEqualTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  true,  // Row 1
+      true,  false, false, // Row 2
+      false, false, true,  // Row 3
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(GreaterEqualTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.55, 0.5, // Row 1
+      -1,  0.05, 0,    1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  true, true,  // Row 1
+      true,  false, true, false, // Row 2
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterEqualTest, Uint8QuantizedRescale)
+{
+  std::vector<float> x_data{
+      0.5, 0.5, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.5,  0.6, 0.5, // Row 1
+      -1,  0.05, 0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  true, true,  // Row 1
+      true,  false, true, false, // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterEqualTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true, false, true, false, // Row 1
+      true, true,  true, false, // Row 2
+      true, false, true, false, // Row 3
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(GreaterEqualTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(GreaterEqualTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/If.cpp b/compiler/luci-interpreter/src/kernels/If.cpp
index e6bdee338..ca982d591 100644
--- a/compiler/luci-interpreter/src/kernels/If.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "kernels/If.h"
+#include "kernels/Utils.h"
 
 #include <cstring>
 
@@ -40,14 +41,14 @@ If::If(const Tensor *cond, const std::vector<const Tensor *> &inputs, std::vecto
 
 void If::configure()
 {
-  assert(cond()->element_type() == DataType::BOOL);
-  assert(cond()->shape().num_elements() == 1);
+  LUCI_INTERPRETER_CHECK(cond()->element_type() == DataType::BOOL);
+  LUCI_INTERPRETER_CHECK(cond()->shape().num_elements() == 1);
 
   for (RuntimeGraph *graph : {_then_graph, _else_graph})
   {
     (void)graph;
-    assert(graph->getInputTensors().size() == getInputTensors().size() - 1);
-    assert(graph->getOutputTensors().size() == getOutputTensors().size());
+    LUCI_INTERPRETER_CHECK(graph->getInputTensors().size() == getInputTensors().size() - 1);
+    LUCI_INTERPRETER_CHECK(graph->getOutputTensors().size() == getOutputTensors().size());
   }
 }
 
@@ -62,7 +63,7 @@ void If::execute() const
   // Copy kernel inputs to active graph inputs.
   for (size_t i = 0; i < getInputTensors().size() - 1; ++i)
   {
-    assert(graph_inputs[i]->element_type() == input(i)->element_type());
+    LUCI_INTERPRETER_CHECK(graph_inputs[i]->element_type() == input(i)->element_type());
     graph_inputs[i]->resize(input(i)->shape());
 
     const int32_t num_elements = input(i)->shape().num_elements();
@@ -75,7 +76,7 @@ void If::execute() const
   // Copy graph outputs to kernel outputs.
   for (size_t i = 0; i < getOutputTensors().size(); ++i)
   {
-    assert(graph_outputs[i]->element_type() == output(i)->element_type());
+    LUCI_INTERPRETER_CHECK(graph_outputs[i]->element_type() == output(i)->element_type());
     output(i)->resize(graph_outputs[i]->shape());
 
     const int32_t num_elements = output(i)->shape().num_elements();
diff --git a/compiler/luci-interpreter/src/kernels/If.test.cpp b/compiler/luci-interpreter/src/kernels/If.test.cpp
index 9b3857ce3..6967407fb 100644
--- a/compiler/luci-interpreter/src/kernels/If.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.test.cpp
@@ -85,7 +85,7 @@ TEST(IfTest, CondTrue)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output), ElementsAreArray(ArrayFloatNear({6, 9})));
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({6, 9}));
 }
 
 TEST(IfTest, CondFalse)
@@ -103,7 +103,37 @@ TEST(IfTest, CondFalse)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output), ElementsAreArray(ArrayFloatNear({5, 14})));
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({5, 14}));
+}
+
+TEST(IfTest, InvalidCondType_NEG)
+{
+  Tensor cond = makeInputTensor<DataType::FLOAT32>({1}, {1});
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  RuntimeModule module(nullptr);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module);
+  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+
+  If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(IfTest, InvalidCondElementNum_NEG)
+{
+  Tensor cond = makeInputTensor<DataType::BOOL>({2}, {false, true});
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  RuntimeModule module(nullptr);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module);
+  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+
+  If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
+  EXPECT_ANY_THROW(kernel.configure());
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
index cfa535075..0bf133d9c 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
@@ -34,15 +34,16 @@ L2Normalize::L2Normalize(const Tensor *input, Tensor *output, const L2NormParams
 
 void L2Normalize::configure()
 {
-  assert(input()->shape().num_dims() <= 4);
-  assert(output()->element_type() == DataType::FLOAT32 || output()->element_type() == DataType::U8);
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= 4);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32 ||
+                         output()->element_type() == DataType::U8);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   if (output()->element_type() == DataType::U8)
   {
-    assert(output()->scale() == (1. / 128.));
-    assert(output()->zero_point() == 128);
+    LUCI_INTERPRETER_CHECK(output()->scale() == (1. / 128.));
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == 128);
   }
-  assert(params().activation == Activation::NONE);
+  LUCI_INTERPRETER_CHECK(params().activation == Activation::NONE);
   output()->resize(input()->shape());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
index f53eaca94..8f9431182 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -26,11 +26,11 @@ namespace
 
 using namespace testing;
 
-TEST(L2NormalizeTest, Float)
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  std::vector<float> input_data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
-
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   L2NormParams params{};
@@ -40,14 +40,76 @@ TEST(L2NormalizeTest, Float)
   kernel.configure();
   kernel.execute();
 
-  std::vector<float> ref_output_data{-0.55, 0.3, 0.35, 0.6, -0.35, 0.05};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data)
+{
+  std::pair<float, int32_t> quant_param =
+      quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
+                                  std::max(input_data) > 0 ? std::max(input_data) : 0.f);
+
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 128., 128);
+
+  L2NormParams params{};
+  params.activation = Activation::NONE;
+
+  L2Normalize kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class L2NormalizeTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(L2NormalizeTest, DataTypes);
+
+TYPED_TEST(L2NormalizeTest, Simple)
+{
+  Check<TypeParam>({1, 1, 1, 6}, {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1},
+                   {-0.55, 0.3, 0.35, 0.6, -0.35, 0.05});
 }
 
-// TODO Uint8Quantized
-// Implement GetDequantizedOutput Function.
-// Create Test for Uint8 Case
+TEST(L2NormalizeTest, ActivationType_NEG)
+{
+  std::vector<float> input_data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  L2NormParams params{};
+  params.activation = Activation::RELU6;
+
+  L2Normalize kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(L2NormalizeTest, InvalidOutputQuantParam_NEG)
+{
+  std::vector<float> input_data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 1, 1, 6}, 1. / 64., 127, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 64., 127);
+
+  L2NormParams params{};
+  params.activation = Activation::NONE;
+
+  L2Normalize kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
 
 } // namespace
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
index 37a6ddedc..979364a7f 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
@@ -36,8 +36,8 @@ L2Pool2D::L2Pool2D(const Tensor *input, Tensor *output, const Pool2DParams &para
 
 void L2Pool2D::configure()
 {
-  assert(input()->shape().num_dims() == 4);
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
 
   int batches = input()->shape().dim(0);
   int height = input()->shape().dim(1);
@@ -55,7 +55,7 @@ void L2Pool2D::configure()
   _padding_height =
       computePadding(params().stride_height, 1, height, params().filter_height, out_height);
 
-  assert(input()->element_type() == DataType::FLOAT32);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
   output()->resize({batches, out_height, out_width, channels_out});
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
index 06bb9388f..5f834e3c1 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
@@ -50,8 +50,7 @@ TEST(L2Pool2DTest, FloatNone)
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.5};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
@@ -78,8 +77,7 @@ TEST(L2Pool2DTest, FloatRelu)
   kernel.execute();
 
   std::vector<float> ref_output_data{3.53553, 6.5};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
@@ -106,8 +104,7 @@ TEST(L2Pool2DTest, FloatRelu1)
   kernel.execute();
 
   std::vector<float> ref_output_data{0.353553, 1.0};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
@@ -134,8 +131,7 @@ TEST(L2Pool2DTest, FloatRelu6)
   kernel.execute();
 
   std::vector<float> ref_output_data{0.353553, 6.0};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
@@ -162,12 +158,11 @@ TEST(L2Pool2DTest, FloatPaddingSame)
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.5};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatPaddingSameSlide1)
+TEST(L2Pool2DTest, FloatPaddingSameStride)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
@@ -190,12 +185,11 @@ TEST(L2Pool2DTest, FloatPaddingSameSlide1)
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.0, 6.5, 5.70088, 2.54951, 7.2111, 8.63134, 7.0};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatPaddingValidSlide1)
+TEST(L2Pool2DTest, FloatPaddingValidStride)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
@@ -218,11 +212,54 @@ TEST(L2Pool2DTest, FloatPaddingValidSlide1)
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.0, 6.5};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   // TODO make a Shape checking of output_tensor.
 }
 
+TEST(L2Pool2DTest, InvalidInputShape_NEG)
+{
+  Shape input_shape{1, 2, 4};
+  std::vector<float> input_data{
+      0, 6, 2,  4, //
+      3, 2, 10, 7, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(L2Pool2DTest, InvalidInputOutputType_NEG)
+{
+  Shape input_shape{1, 2, 4};
+  std::vector<float> input_data{
+      0, 6, 2,  4, //
+      3, 2, 10, 7, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
index 1a26debe0..919b12792 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
@@ -36,7 +36,7 @@ LeakyRelu::LeakyRelu(const Tensor *input, Tensor *output, const LeakyReluParams
 
 void LeakyRelu::configure()
 {
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   if (input()->element_type() == DataType::U8)
   {
     double alpha_multiplier = input()->scale() * params().alpha / output()->scale();
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
index c79d3d6bc..2778549ed 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -28,12 +28,11 @@ using namespace testing;
 
 template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
-           std::initializer_list<T> input_data, std::initializer_list<T> output_data, float alpha,
-           DataType element_type)
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data,
+           float alpha)
 {
-  Tensor input_tensor{element_type, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(T));
-
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(element_type);
 
   LeakyReluParams params{};
@@ -44,30 +43,75 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
   kernel.configure();
   kernel.execute();
 
-  (void)output_shape;
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
 }
 
-TEST(LeakReluTest, FloatSimple)
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data, float alpha)
 {
-  Check<float>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 3}, /*input_data=*/
-               {
-                   0.0f, 1.0f, 3.0f,   // Row 1
-                   1.0f, -1.0f, -2.0f, // Row 2
-               },
-               /*output_data=*/
-               {
-                   0.0f, 1.0f, 3.0f,   // Row 1
-                   1.0f, -0.5f, -1.0f, // Row 2
-               },
-               /*alpha=*/0.5f, getElementType<float>());
+  const float quantized_tolerance = getTolerance(-8, 127.f / 16.f, 255);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-8, 127.f / 16.f);
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  LeakyReluParams params{};
+  params.alpha = alpha;
+
+  LeakyRelu kernel(&input_tensor, &output_tensor, params);
+
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, quantized_tolerance));
+}
+
+template <typename T> class LeakReluTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(LeakReluTest, DataTypes);
+
+TYPED_TEST(LeakReluTest, Simple)
+{
+  Check<TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 3},
+                   /*input_data=*/
+                   {
+                       0.0f, 1.0f, 3.0f,   // Row 1
+                       1.0f, -1.0f, -2.0f, // Row 2
+                   },
+                   /*output_data=*/
+                   {
+                       0.0f, 1.0f, 3.0f,   // Row 1
+                       1.0f, -0.5f, -1.0f, // Row 2
+                   },
+                   /*alpha=*/0.5f);
 
   SUCCEED();
 }
 
-// TODO Uint8Simple
-// Implement GetDequantizedOutput Function.
-// Create Test for Uint8 Case
+TEST(LeakReluTest, IvalidInputOutputType_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, {
+                                                                       0.0f, 1.0f, 3.0f,   // Row 1
+                                                                       1.0f, -1.0f, -2.0f, // Row 2
+                                                                   });
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  LeakyReluParams params{};
+  params.alpha = 0.5f;
+
+  LeakyRelu kernel(&input_tensor, &output_tensor, params);
+
+  EXPECT_ANY_THROW(kernel.configure());
+}
 
 } // namespace
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Less.cpp b/compiler/luci-interpreter/src/kernels/Less.cpp
new file mode 100644
index 000000000..041444926
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Less.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Less.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Less::Less(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void Less::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void Less::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Less::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowLess(op_params, getTensorShape(x()), x_data,
+                                               getTensorShape(y()), y_data,
+                                               getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::Less(op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
+                                getTensorShape(output()), output_data);
+  }
+}
+
+void Less::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowLessWithScaling(op_params, getTensorShape(x()), x_data,
+                                                          getTensorShape(y()), y_data,
+                                                          getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::LessWithScaling(op_params, getTensorShape(x()), x_data,
+                                           getTensorShape(y()), y_data, getTensorShape(output()),
+                                           output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Less.h b/compiler/luci-interpreter/src/kernels/Less.h
new file mode 100644
index 000000000..fe03e10b1
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Less.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LESS_H
+#define LUCI_INTERPRETER_KERNELS_LESS_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Less : public Kernel
+{
+public:
+  Less(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LESS_H
diff --git a/compiler/luci-interpreter/src/kernels/Less.test.cpp b/compiler/luci-interpreter/src/kernels/Less.test.cpp
new file mode 100644
index 000000000..73aa30b36
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Less.test.cpp
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Less.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(LessTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  false, false, // Row 1
+      false, false, true,  // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(LessTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  false, false, // Row 1
+      false, true,  true,  // Row 2
+      true,  true,  false, // Row 3
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(LessTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.55, 0.5, // Row 1
+      -1,  0.05, 0,    1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  false, false, false, // Row 1
+      false, true,  false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessTest, Uint8QuantizedRescale)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.6, 0.5, // Row 1
+      -1,  0.05, 0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  false, false, false, // Row 1
+      false, true,  false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  false, true, // Row 1
+      false, false, false, true, // Row 2
+      false, true,  false, true, // Row 3
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LessTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Less kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.cpp
new file mode 100644
index 000000000..b8aaba178
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LessEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+LessEqual::LessEqual(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void LessEqual::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void LessEqual::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void LessEqual::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowLessEqual(op_params, getTensorShape(x()), x_data,
+                                                    getTensorShape(y()), y_data,
+                                                    getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::LessEqual(op_params, getTensorShape(x()), x_data, getTensorShape(y()),
+                                     y_data, getTensorShape(output()), output_data);
+  }
+}
+
+void LessEqual::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowLessEqualWithScaling(
+        op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
+        getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::LessEqualWithScaling(op_params, getTensorShape(x()), x_data,
+                                                getTensorShape(y()), y_data,
+                                                getTensorShape(output()), output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.h b/compiler/luci-interpreter/src/kernels/LessEqual.h
new file mode 100644
index 000000000..ed4b0f1ea
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class LessEqual : public Kernel
+{
+public:
+  LessEqual(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
new file mode 100644
index 000000000..9184c061f
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LessEqual.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(LessEqualTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  true, false, // Row 1
+      false, true, true,  // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(LessEqualTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  true, false, // Row 1
+      false, true, true,  // Row 2
+      true,  true, false, // Row 3
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(LessEqualTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.55, 0.5, // Row 1
+      -1,  0.05, 0,    1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  true, false, false, // Row 1
+      false, true, false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessEqualTest, Uint8QuantizedRescale)
+{
+  std::vector<float> x_data{
+      0.5, 0.6, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.6,  0.6, 0.5, // Row 1
+      -1,  0.05, 0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  true, false, false, // Row 1
+      false, true, false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessEqualTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      false, true,  false, true, // Row 1
+      false, false, true,  true, // Row 2
+      false, true,  false, true, // Row 3
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(LessEqualTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LessEqualTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
index 08efa1d6a..b78e27128 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
@@ -36,9 +36,9 @@ LocalResponseNormalization::LocalResponseNormalization(
 
 void LocalResponseNormalization::configure()
 {
-  assert(input()->shape().num_dims() == 4);
-  assert(output()->element_type() == DataType::FLOAT32);
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   output()->resize(input()->shape());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
index 4191bdb29..d98305c1a 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
@@ -44,7 +44,7 @@ TEST(LocalResponseNormalizationTest, SameAsL2Norm)
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})));
+              FloatArrayNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
 TEST(LocalResponseNormalizationTest, WithAlpha)
@@ -64,7 +64,7 @@ TEST(LocalResponseNormalizationTest, WithAlpha)
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear({-0.275, 0.15, 0.175, 0.3, -0.175, 0.025})));
+              FloatArrayNear({-0.275, 0.15, 0.175, 0.3, -0.175, 0.025}));
 }
 
 TEST(LocalResponseNormalizationTest, WithBias)
@@ -84,7 +84,7 @@ TEST(LocalResponseNormalizationTest, WithBias)
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02})));
+              FloatArrayNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02}));
 }
 
 TEST(LocalResponseNormalizationTest, SmallRadius)
@@ -104,8 +104,39 @@ TEST(LocalResponseNormalizationTest, SmallRadius)
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(
-                  ArrayFloatNear({-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266})));
+              FloatArrayNear({-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266}));
+}
+
+TEST(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
+{
+  Tensor input_tensor =
+      makeInputTensor<DataType::FLOAT32>({1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 1.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LocalResponseNormalizationTest, InvalidInputOutputType_NEG)
+{
+  Tensor input_tensor =
+      makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 1.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp b/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp
new file mode 100644
index 000000000..03d13e4ce
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogSoftmax.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+LogSoftmax::LogSoftmax(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void LogSoftmax::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  if (input()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(output()->scale() == 16. / 256);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == 255);
+
+    tflite::SoftmaxParams params{};
+
+    params.table = _table;
+    params.beta = 1.0;
+
+    tflite::optimized_ops::PopulateSoftmaxLookupTable(&params, input()->scale(), params.beta);
+  }
+  output()->resize(input()->shape());
+}
+
+void LogSoftmax::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void LogSoftmax::evalFloat() const
+{
+  tflite::SoftmaxParams params{};
+  tflite::reference_ops::LogSoftmax(params, getTensorShape(input()), getTensorData<float>(input()),
+                                    getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void LogSoftmax::evalQuantized() const
+{
+  const auto input_shape = getTensorShape(input());
+  const auto output_shape = getTensorShape(output());
+  const auto input_scale = input()->scale();
+  uint8_t *output_data = getTensorData<uint8_t>(output());
+  const uint8_t *input_data = getTensorData<uint8_t>(input());
+
+  tflite::SoftmaxParams params{};
+
+  params.table = const_cast<float *>(_table);
+  params.zero_point = output()->zero_point();
+  params.scale = output()->scale();
+
+  tflite::optimized_ops::LogSoftmax(params, input_scale, input_shape, input_data, output_shape,
+                                    output_data);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.h b/compiler/luci-interpreter/src/kernels/LogSoftmax.h
new file mode 100644
index 000000000..18477fbe3
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
+#define LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class LogSoftmax : public Kernel
+{
+public:
+  LogSoftmax(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+  float _table[256];
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
new file mode 100644
index 000000000..d3b331dfe
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogSoftmax.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(LogSoftmaxTest, Float)
+{
+  Shape input_shape{2, 4};
+  std::vector<float> input_data{
+      0, -6, 2,  4, //
+      3, -2, 10, 1, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+      -4.14297, -10.14297, -2.14297,   -.142971, //
+      -7.00104, -12.00104, -.00104087, -9.00104, //
+  };
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(LogSoftmaxTest, Uint8)
+{
+  float kMin = -10;
+  float kMax = 10;
+  float kLogSoftmaxQuantizedTolerance = 16. / 256;
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
+  std::vector<float> input_data{
+      0, -6, 2,  4, //
+      3, -2, 10, 1, //
+  };
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+      -4.14297, -10.14297, -2.14297,   -.142971, //
+      -7.00104, -12.00104, -.00104087, -9.00104, //
+  };
+  std::vector<int32_t> ref_output_shape{2, 4};
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kLogSoftmaxQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
+}
+
+TEST(LogSoftmaxTest, InvalidInputOutputType_NEG)
+{
+  std::vector<float> input_data{
+      0, -6, 2,  4, //
+      3, -2, 10, 1, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 4}, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LogSoftmaxTest, InvalidOutputQuantParam_NEG)
+{
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-10, 10);
+  std::vector<float> input_data{
+      0, -6, 2,  4, //
+      3, -2, 10, 1, //
+  };
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 20. / 256, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.cpp b/compiler/luci-interpreter/src/kernels/Logistic.cpp
index c7d45615c..97d7bf13d 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.cpp
@@ -29,10 +29,10 @@ Logistic::Logistic(const Tensor *input, Tensor *output) : Kernel({input}, {outpu
 
 void Logistic::configure()
 {
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   if (input()->element_type() == DataType::U8)
   {
-    assert(output()->scale() == 1. / 256);
+    LUCI_INTERPRETER_CHECK(output()->scale() == 1. / 256);
     populateLookupTable();
   }
   output()->resize(input()->shape());
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
index 00feddf3d..d3bbb330d 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -26,31 +26,108 @@ namespace
 
 using namespace testing;
 
-TEST(LogisticTest, Float)
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Shape input_shape{1, 2, 4, 1};
-  std::vector<float> input_data{
-      0, -6, 2,  4, //
-      3, -2, 10, 1, //
-  };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor input_tensor = makeInputTensor<getElementType<T>()>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(getElementType<T>());
 
   Logistic kernel(&input_tensor, &output_tensor);
   kernel.configure();
   kernel.execute();
 
-  std::vector<float> ref_output_data{
-      0.5,      0.002473, 0.880797, 0.982014, //
-      0.952574, 0.119203, 0.999955, 0.731059, //
-  };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
-  // TODO make a Shape checking of output_tensor.
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
-// TODO Uint8
-// Need to Implement GetDequantizedOutput Function.
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data)
+{
+  std::pair<float, int32_t> input_quant_param =
+      quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale() * 2));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class LogisticTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(LogisticTest, DataTypes);
+
+TYPED_TEST(LogisticTest, Simple)
+{
+  Check<TypeParam>(
+      {89}, {89},
+      {-10.0000000000, -9.7727272727, -9.5454545455, -9.3181818182, -9.0909090909, -8.8636363636,
+       -8.6363636364,  -8.4090909091, -8.1818181818, -7.9545454545, -7.7272727273, -7.5000000000,
+       -7.2727272727,  -7.0454545455, -6.8181818182, -6.5909090909, -6.3636363636, -6.1363636364,
+       -5.9090909091,  -5.6818181818, -5.4545454545, -5.2272727273, -5.0000000000, -4.7727272727,
+       -4.5454545455,  -4.3181818182, -4.0909090909, -3.8636363636, -3.6363636364, -3.4090909091,
+       -3.1818181818,  -2.9545454545, -2.7272727273, -2.5000000000, -2.2727272727, -2.0454545455,
+       -1.8181818182,  -1.5909090909, -1.3636363636, -1.1363636364, -0.9090909091, -0.6818181818,
+       -0.4545454545,  -0.2272727273, 0.0000000000,  0.2272727273,  0.4545454545,  0.6818181818,
+       0.9090909091,   1.1363636364,  1.3636363636,  1.5909090909,  1.8181818182,  2.0454545455,
+       2.2727272727,   2.5000000000,  2.7272727273,  2.9545454545,  3.1818181818,  3.4090909091,
+       3.6363636364,   3.8636363636,  4.0909090909,  4.3181818182,  4.5454545455,  4.7727272727,
+       5.0000000000,   5.2272727273,  5.4545454545,  5.6818181818,  5.9090909091,  6.1363636364,
+       6.3636363636,   6.5909090909,  6.8181818182,  7.0454545455,  7.2727272727,  7.5000000000,
+       7.7272727273,   7.9545454545,  8.1818181818,  8.4090909091,  8.6363636364,  8.8636363636,
+       9.0909090909,   9.3181818182,  9.5454545455,  9.7727272727,  10.0000000000},
+      {0.0000453979, 0.0000569815, 0.0000715205, 0.0000897689, 0.0001126729, 0.0001414198,
+       0.0001774998, 0.0002227827, 0.0002796147, 0.0003509396, 0.0004404502, 0.0005527786,
+       0.0006937345, 0.0008706021, 0.0010925128, 0.0013709094, 0.0017201256, 0.0021581065,
+       0.0027073042, 0.0033957870, 0.0042586071, 0.0053394826, 0.0066928509, 0.0083863576,
+       0.0105038445, 0.0131488902, 0.0164489307, 0.0205599431, 0.0256715863, 0.0320125562,
+       0.0398556989, 0.0495221198, 0.0613831074, 0.0758581800, 0.0934070047, 0.1145124805,
+       0.1396521834, 0.1692560327, 0.2036499335, 0.2429886272, 0.2871859014, 0.3358556241,
+       0.3882805886, 0.4434251301, 0.5000000000, 0.5565748699, 0.6117194114, 0.6641443759,
+       0.7128140986, 0.7570113728, 0.7963500665, 0.8307439673, 0.8603478166, 0.8854875195,
+       0.9065929953, 0.9241418200, 0.9386168926, 0.9504778802, 0.9601443011, 0.9679874438,
+       0.9743284137, 0.9794400569, 0.9835510693, 0.9868511098, 0.9894961555, 0.9916136424,
+       0.9933071491, 0.9946605174, 0.9957413929, 0.9966042130, 0.9972926958, 0.9978418935,
+       0.9982798744, 0.9986290906, 0.9989074872, 0.9991293979, 0.9993062655, 0.9994472214,
+       0.9995595498, 0.9996490604, 0.9997203853, 0.9997772173, 0.9998225002, 0.9998585802,
+       0.9998873271, 0.9999102311, 0.9999284795, 0.9999430185, 0.9999546021});
+}
+
+TEST(LogisticTest, IvalidInputOutputType_NEG)
+{
+  Shape input_shape = {1};
+  std::vector<float> input_data{10};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LogisticTest, IvalidQuantParam_NEG)
+{
+  Shape input_shape = {2};
+  std::vector<float> input_data{-10, 10};
+  std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-10, 10);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 255, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
 
 } // namespace
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp b/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
index afecf9058..123e6e1a2 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
@@ -18,6 +18,7 @@
 
 #include "kernels/Utils.h"
 
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
 #include <tensorflow/lite/kernels/internal/reference/pooling.h>
 
 #include <stdexcept>
@@ -35,7 +36,7 @@ MaxPool2D::MaxPool2D(const Tensor *input, Tensor *output, const Pool2DParams &pa
 
 void MaxPool2D::configure()
 {
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   assert(input()->shape().num_dims() == 4);
   const Shape &input_shape = input()->shape();
   const int32_t batches = input_shape.dim(0);
@@ -54,10 +55,15 @@ void MaxPool2D::configure()
       computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
 
   output()->resize({batches, output_height, output_width, depth});
-  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S8)
+  if (input()->element_type() == DataType::U8)
   {
-    assert(input()->scale() == output()->scale());
-    assert(input()->zero_point() == output()->zero_point());
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
+  else if (input()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
   }
 }
 
@@ -71,6 +77,9 @@ void MaxPool2D::execute() const
     case DataType::U8:
       evalQuantized();
       break;
+    case DataType::S16:
+      evalSInt16();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -116,5 +125,26 @@ void MaxPool2D::evalQuantized() const
                                  getTensorShape(output()), getTensorData<uint8_t>(output()));
 }
 
+void MaxPool2D::evalSInt16() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::PoolParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.filter_height = _params.filter_height;
+  params.filter_width = _params.filter_width;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  tflite::reference_integer_ops::MaxPool(
+      params, getTensorShape(input()), getTensorData<int16_t>(input()), //
+      getTensorShape(output()), getTensorData<int16_t>(output()));
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.h b/compiler/luci-interpreter/src/kernels/MaxPool2D.h
index 7a59ff022..bb7666305 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.h
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.h
@@ -39,6 +39,7 @@ public:
 private:
   void evalFloat() const;
   void evalQuantized() const;
+  void evalSInt16() const;
 
 private:
   int32_t _padding_height{};
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
index 390255d89..1d7fe06c4 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
@@ -54,8 +54,7 @@ TEST(MaxPool2DTest, Float)
       5, 6, //
   };
   std::initializer_list<int32_t> ref_output_shape{1, 2, 2, 1};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -66,11 +65,9 @@ TEST(MaxPool2DTest, Uint8)
       0,  -6, 12, 4, //
       -3, -2, 10, 7, //
   };
-  Tensor input_tensor{DataType::U8, {1, 2, 4, 1}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
-  std::vector<uint8_t> quantize_input =
-      quantize<uint8_t>(input_data, quant_param.first, quant_param.second);
-  input_tensor.writeData(quantize_input.data(), quantize_input.size() * sizeof(uint8_t));
 
   Pool2DParams params{};
   params.padding = Padding::VALID;
@@ -86,12 +83,43 @@ TEST(MaxPool2DTest, Uint8)
 
   std::vector<float> ref_output_data{0.0, 6.0};
   std::initializer_list<int32_t> ref_output_shape{1, 1, 2, 1};
-  EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                                  output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
+TEST(MaxPool2DTest, SInt16)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{
+      1,  -1, 0,  -2, 2,  //
+      -7, -6, -5, -4, -3, //
+      5,  4,  3,  6,  7,  //
+  };
+  std::vector<float> ref_output_data{
+      1, 2, //
+      5, 6, //
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.2, 0, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  MaxPool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Maximum.cpp b/compiler/luci-interpreter/src/kernels/Maximum.cpp
new file mode 100644
index 000000000..c522b0706
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Maximum.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Maximum.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Maximum::Maximum(const Tensor *input1, const Tensor *input2, Tensor *output)
+    : Kernel({input1, input2}, {output})
+{
+}
+
+void Maximum::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type())
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type())
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Maximum::execute() const
+{
+  switch (input1()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalMaximum<float>();
+      break;
+    case DataType::U8:
+      evalMaximum<uint8_t>();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+template <typename T> inline void Maximum::evalMaximum() const
+{
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                        getTensorShape(input2()), getTensorData<T>(input2()),
+                        getTensorShape(output()), getTensorData<T>(output()),
+                        [](T x, T y) { return std::max(x, y); });
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Maximum.h b/compiler/luci-interpreter/src/kernels/Maximum.h
new file mode 100644
index 000000000..3c99e69c7
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Maximum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MAXIMUM_H
+#define LUCI_INTERPRETER_KERNELS_MAXIMUM_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Maximum : public Kernel
+{
+public:
+  Maximum(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> inline void evalMaximum() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MAXIMUM_H
diff --git a/compiler/luci-interpreter/src/kernels/Maximum.test.cpp b/compiler/luci-interpreter/src/kernels/Maximum.test.cpp
new file mode 100644
index 000000000..2ddaeaf04
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Maximum.test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Maximum.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(MaximumTest, Float)
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
+  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<float> ref_output_data{1.0, 0.0, 1.0, 12.0, -2.0, -1.43};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(MaximumTest, Uint8)
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
+  std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
+  Tensor input_tensor1 = makeInputTensor<DataType::U8>(input_shape, input_data1);
+  Tensor input_tensor2 = makeInputTensor<DataType::U8>(input_shape, input_data2);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<int32_t> ref_output_shape{2, 4};
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({1, 0, 2, 12, 255, 23}));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Mean.cpp b/compiler/luci-interpreter/src/kernels/Mean.cpp
index 2394e2c0e..7d022eaf8 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.cpp
@@ -130,8 +130,13 @@ Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, const Reduce
 
 void Mean::configure()
 {
-  assert(input()->element_type() == output()->element_type());
-  assert(axes()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32);
+  if (input()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+
   const Shape &input_shape = input()->shape();
   int input_num_dims = input_shape.num_dims();
 
@@ -169,6 +174,9 @@ void Mean::execute() const
     case DataType::U8:
       evalQuantized();
       break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -245,5 +253,74 @@ void Mean::evalQuantized() const
   }
 }
 
+void Mean::evalQuantizedS16() const
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  auto *output_data = getTensorData<int16_t>(output());
+
+  const Shape &input_shape = input()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  const int num_axes = axes()->shape().num_elements();
+
+  constexpr int32_t output_min = -std::numeric_limits<int16_t>::max();
+  constexpr int32_t output_max = std::numeric_limits<int16_t>::max();
+
+  // Defer to specialized implementation for 4D Mean across axes 1 & 2.
+  if (_params.keep_dims && input_shape.num_dims() == 4 && num_axes == 2 &&
+      ((axes_data[0] == 1 && axes_data[1] == 2) || (axes_data[0] == 2 && axes_data[1] == 1)))
+  {
+    const int32_t batches = input_shape.dim(0);
+    const int32_t input_height = input_shape.dim(1);
+    const int32_t input_width = input_shape.dim(2);
+    const int32_t depth = input_shape.dim(3);
+    assert(output_shape.num_dims() == 4);
+    assert(output_shape.dim(0) == batches);
+    assert(output_shape.dim(1) == 1);
+    assert(output_shape.dim(2) == 1);
+    assert(output_shape.dim(3) == depth);
+
+    const double real_multiplier =
+        static_cast<double>(input()->scale()) / static_cast<double>(output()->scale());
+
+    int32_t output_multiplier{};
+    int output_shift{};
+    quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+
+    const int32_t num_elements_in_axes = input_height * input_width;
+
+    for (int32_t batch = 0; batch < batches; ++batch)
+    {
+      for (int32_t c = 0; c < depth; ++c)
+      {
+        int32_t acc = 0;
+        for (int32_t in_y = 0; in_y < input_height; ++in_y)
+        {
+          for (int32_t in_x = 0; in_x < input_width; ++in_x)
+          {
+            acc += input_data[calcOffset(input_shape, batch, in_y, in_x, c)];
+          }
+        }
+        int32_t scaled_acc =
+            tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+        // Divide by the number of elements rounding to the nearest integer.
+        scaled_acc = scaled_acc > 0
+                         ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes
+                         : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes;
+
+        scaled_acc = std::max(scaled_acc, output_min);
+        scaled_acc = std::min(scaled_acc, output_max);
+
+        output_data[calcOffset(output_shape, batch, 0, 0, c)] = scaled_acc;
+      }
+    }
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported configuration.");
+  }
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Mean.h b/compiler/luci-interpreter/src/kernels/Mean.h
index 9cc793c72..1cc046894 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.h
+++ b/compiler/luci-interpreter/src/kernels/Mean.h
@@ -42,6 +42,7 @@ public:
 private:
   void evalFloat() const;
   void evalQuantized() const;
+  void evalQuantizedS16() const;
 
 private:
   std::unique_ptr<Tensor> _temp_index;
diff --git a/compiler/luci-interpreter/src/kernels/Mean.test.cpp b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
index f4e411ca4..e81d2ad5f 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
@@ -47,8 +47,7 @@ TEST(MeanTest, FloatKeepDims)
 
   std::vector<float> ref_output_data{10.5, 12.5, 14.5};
   std::initializer_list<int32_t> ref_output_shape{1, 3, 1};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -72,8 +71,7 @@ TEST(MeanTest, FloatKeepDims4DMean)
 
   std::vector<float> ref_output_data{6, 7, 18, 19};
   std::initializer_list<int32_t> ref_output_shape{2, 1, 1, 2};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -97,8 +95,7 @@ TEST(MeanTest, FloatNotKeepDims)
 
   std::vector<float> ref_output_data{12, 13};
   std::initializer_list<int32_t> ref_output_shape{2};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -109,12 +106,10 @@ TEST(MeanTest, Uint8KeepDims)
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
 
   std::vector<int32_t> axis_data{1};
-  Tensor input_tensor{DataType::U8, {3, 2}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second, input_data);
   Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
-  std::vector<uint8_t> quantize_input =
-      quantize<uint8_t>(input_data, quant_param.first, quant_param.second);
-  input_tensor.writeData(quantize_input.data(), quantize_input.size() * sizeof(uint8_t));
 
   ReducerParams params{};
   params.keep_dims = true;
@@ -125,9 +120,8 @@ TEST(MeanTest, Uint8KeepDims)
 
   std::vector<float> ref_output_data{0.3, 0.35, 0.55};
   std::initializer_list<int32_t> ref_output_shape{3, 1};
-  EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                                  output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data, kQuantizedTolerance)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
@@ -138,12 +132,10 @@ TEST(MeanTest, Uint8NotKeepDims)
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
 
   std::vector<int32_t> axis_data{1};
-  Tensor input_tensor{DataType::U8, {1, 3, 2}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>({1, 3, 2}, quant_param.first, quant_param.second, input_data);
   Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
-  std::vector<uint8_t> quantize_input =
-      quantize<uint8_t>(input_data, quant_param.first, quant_param.second);
-  input_tensor.writeData(quantize_input.data(), quantize_input.size() * sizeof(uint8_t));
 
   ReducerParams params{};
   params.keep_dims = false;
@@ -154,12 +146,34 @@ TEST(MeanTest, Uint8NotKeepDims)
 
   std::vector<float> ref_output_data{0.4, 0.4};
   std::initializer_list<int32_t> ref_output_shape{1, 2};
-  EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                                  output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data, kQuantizedTolerance)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
+TEST(MeanTest, SInt16KeepDims4D)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  std::vector<int32_t> axes_data{1, 2};
+  std::vector<float> ref_output_data{6, 7, 18, 19};
+
+  Tensor input_tensor = makeInputTensor<DataType::S16>({2, 2, 3, 2}, 0.25, 0, input_data);
+  Tensor axes_tensor = makeInputTensor<DataType::S32>({2}, axes_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+
+  Mean kernel(&input_tensor, &axes_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
 } // namespace
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Minimum.cpp b/compiler/luci-interpreter/src/kernels/Minimum.cpp
new file mode 100644
index 000000000..5eb13455e
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Minimum.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Minimum.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Minimum::Minimum(const Tensor *input1, const Tensor *input2, Tensor *output)
+    : Kernel({input1, input2}, {output})
+{
+}
+
+void Minimum::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type())
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type())
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Minimum::execute() const
+{
+  switch (input1()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalMinimum<float>();
+      break;
+    case DataType::U8:
+      evalMinimum<uint8_t>();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+template <typename T> inline void Minimum::evalMinimum() const
+{
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                        getTensorShape(input2()), getTensorData<T>(input2()),
+                        getTensorShape(output()), getTensorData<T>(output()),
+                        [](T x, T y) { return std::min(x, y); });
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Minimum.h b/compiler/luci-interpreter/src/kernels/Minimum.h
new file mode 100644
index 000000000..5ff4035b4
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Minimum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MINIMUM_H
+#define LUCI_INTERPRETER_KERNELS_MINIMUM_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Minimum : public Kernel
+{
+public:
+  Minimum(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> inline void evalMinimum() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MINIMUM_H
diff --git a/compiler/luci-interpreter/src/kernels/Minimum.test.cpp b/compiler/luci-interpreter/src/kernels/Minimum.test.cpp
new file mode 100644
index 000000000..b6420dd9b
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Minimum.test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Minimum.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(MinimumTest, Float)
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
+  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Minimum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<float> ref_output_data{-1.0, 0.0, -1.0, 11.0, -3.0, -1.44};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(MinimumTest, Uint8)
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
+  std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
+  Tensor input_tensor1 = makeInputTensor<DataType::U8>(input_shape, input_data1);
+  Tensor input_tensor2 = makeInputTensor<DataType::U8>(input_shape, input_data2);
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Minimum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  std::vector<int32_t> ref_output_shape{2, 4};
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({0, 0, 1, 11, 2, 1}));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Mul.test.cpp b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
index f2255ac3f..fbda3bece 100644
--- a/compiler/luci-interpreter/src/kernels/Mul.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
@@ -56,8 +56,7 @@ TEST(MulTest, Float)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ::testing::ElementsAreArray(ArrayFloatNear(test_outputs[i], 0.0001f)))
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
         << "With shape number " << i;
   }
   // Re-run with exchanged inputs.
@@ -74,8 +73,7 @@ TEST(MulTest, Float)
     kernel.configure();
     kernel.execute();
 
-    EXPECT_THAT(extractTensorData<float>(output_tensor),
-                ::testing::ElementsAreArray(ArrayFloatNear(test_outputs[i], 0.0001f)))
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
         << "With shape number " << i;
   }
 }
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.cpp
new file mode 100644
index 000000000..cd2f6c2c1
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/NotEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+NotEqual::NotEqual(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void NotEqual::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void NotEqual::execute() const
+{
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void NotEqual::evalFloat() const
+{
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowNotEqual(op_params, getTensorShape(x()), x_data,
+                                                   getTensorShape(y()), y_data,
+                                                   getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::NotEqual(op_params, getTensorShape(x()), x_data, getTensorShape(y()),
+                                    y_data, getTensorShape(output()), output_data);
+  }
+}
+
+void NotEqual::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_shift = _x_shift;
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowNotEqualWithScaling(
+        op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
+        getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::NotEqualWithScaling(op_params, getTensorShape(x()), x_data,
+                                               getTensorShape(y()), y_data,
+                                               getTensorShape(output()), output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.h b/compiler/luci-interpreter/src/kernels/NotEqual.h
new file mode 100644
index 000000000..d729c6c14
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class NotEqual : public Kernel
+{
+public:
+  NotEqual(const Tensor *x, const Tensor *y, Tensor *output);
+
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _x_multiplier = 0;
+  int32_t _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int32_t _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
new file mode 100644
index 000000000..8c8712371
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/NotEqual.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(NotEqualTest, FloatSimple)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+      -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true, false, true, // Row 1
+      true, false, true, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(NotEqualTest, FloatBroardcast)
+{
+  std::vector<float> x_data{
+      0.5, 0.7, 0.9, // Row 1
+      1,   0,   -1,  // Row 2
+      -1,  0,   1,   // Row 3
+      0.9, 0.7, 0.5, // Row 4
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  false, true,  // Row 1
+      true,  true,  true,  // Row 2
+      true,  true,  true,  // Row 3
+      false, false, false, // Row 4
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST(NotEqualTest, Uint8Quantized)
+{
+  std::vector<float> x_data{
+      0.5, 0.5, 0.7,  0.9, // Row 1
+      1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+      0.9, 0.5, 0.55, 0.5, // Row 1
+      -1,  0,   0.05, 1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+      true, false, true,  true, // Row 1
+      true, false, false, true, // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
+                                                  x_quant_param.second, x_data);
+
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
+                                                  y_quant_param.second, y_data);
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(NotEqualTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> x_data{
+      0.4,  -0.8, 0.7,  0.3, // Row 1
+      -0.5, 0.1,  0,    0.5, // Row 2
+      1,    0,    0.05, -1,  // Row 3
+      -1,   0.05, 0,    1,   // Row 4
+  };
+
+  std::vector<float> y_data{
+      -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+      true,  true,  true,  true,  // Row 1
+      true,  true,  false, true,  // Row 2
+      true,  true,  true,  true,  // Row 3
+      false, false, false, false, // Row 4
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor =
+      makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
+  Tensor y_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST(NotEqualTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(NotEqualTest, Input_Output_Type_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Pad.test.cpp b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
index 15fcd0da3..4bee07629 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
@@ -34,12 +34,10 @@ TEST(Pad, Uint8)
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
   std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
   std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
-  Tensor input_tensor{DataType::U8, {1, 2, 3, 1}, {{quant_param.first}, {quant_param.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
   Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
-  std::vector<uint8_t> quantize_input =
-      quantize<uint8_t>(input_data, quant_param.first, quant_param.second);
-  input_tensor.writeData(quantize_input.data(), quantize_input.size() * sizeof(uint8_t));
 
   Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
   kernel.configure();
@@ -47,9 +45,8 @@ TEST(Pad, Uint8)
 
   std::vector<float> ref_output_data{0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
                                      0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0};
-  EXPECT_THAT(dequantize(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                         output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data, kQuantizedTolerance)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 7, 1}));
 }
 
@@ -69,8 +66,7 @@ TEST(Pad, Float)
                                      0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 4, 5,
                                      6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   std::initializer_list<int32_t> ref_output_shape{2, 4, 6, 1};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Pow.cpp b/compiler/luci-interpreter/src/kernels/Pow.cpp
new file mode 100644
index 000000000..afc10b80e
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pow.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pow.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Pow::Pow(const Tensor *input1, const Tensor *input2, Tensor *output)
+    : Kernel({input1, input2}, {output})
+{
+}
+
+void Pow::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Pow::execute() const
+{
+  switch (input1()->element_type())
+  {
+    case DataType::FLOAT32:
+      eval<float>();
+      break;
+    case DataType::S32:
+      eval<int32_t>();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+template <typename T> void Pow::eval() const
+{
+  tflite::ArithmeticParams params{};
+
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+      getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastPow4DSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                                              getTensorShape(input2()), getTensorData<T>(input2()),
+                                              getTensorShape(output()), getTensorData<T>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Pow(getTensorShape(input1()), getTensorData<T>(input1()),
+                               getTensorShape(input2()), getTensorData<T>(input2()),
+                               getTensorShape(output()), getTensorData<T>(output()));
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Pow.h b/compiler/luci-interpreter/src/kernels/Pow.h
new file mode 100644
index 000000000..8ff865e40
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pow.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_POW_H
+#define LUCI_INTERPRETER_KERNELS_POW_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Pow : public Kernel
+{
+public:
+  Pow(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void eval() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_POW_H
diff --git a/compiler/luci-interpreter/src/kernels/Pow.test.cpp b/compiler/luci-interpreter/src/kernels/Pow.test.cpp
new file mode 100644
index 000000000..69d8946c8
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pow.test.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pow.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(PowTest, SimplePow)
+{
+  std::initializer_list<int32_t> base_shape = {1, 1, 3, 2};
+
+  std::vector<float> input1_data{0.3f, 2.3f, 0.9f, 0.5f, 0.8f, 1.1f};
+  std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::vector<float> test_outputs{0.786f, 1.2838f, 1.043f, 0.7071f, 0.8f, 1.08956f};
+
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
+}
+
+TEST(PowTest, FloatBroadcastPow)
+{
+  std::initializer_list<int32_t> input1_shape = {1, 3};
+  std::initializer_list<int32_t> input2_shape = {3, 1};
+
+  std::vector<float> input1_data{0.3f, 2.3f, 0.9f};
+  std::vector<float> input2_data{0.2f, 0.3f, 0.4f};
+  std::vector<float> test_outputs{0.786f,   1.18126f, 0.9791f, 0.6968f, 1.28386f,
+                                  0.96888f, 0.6178f,  1.3953f, 0.9587f};
+
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+}
+
+TEST(PowTest, IntPow)
+{
+  std::initializer_list<int32_t> base_shape = {1, 3};
+
+  std::vector<int32_t> input_data{2, 3, 4};
+  std::vector<int32_t> test_outputs{4, 27, 256};
+
+  Tensor input1_tensor = makeInputTensor<DataType::S32>(base_shape, input_data);
+  Tensor input2_tensor = makeInputTensor<DataType::S32>(base_shape, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(test_outputs));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
+}
+
+TEST(PowTest, Input_Output_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f});
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {4});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.cpp b/compiler/luci-interpreter/src/kernels/Prelu.cpp
new file mode 100644
index 000000000..e658d87b5
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Prelu.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Prelu.h"
+
+#include "kernels/BinaryOpCommon.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Prelu::Prelu(const Tensor *input, const Tensor *alpha, Tensor *output)
+    : Kernel({input, alpha}, {output})
+{
+}
+
+void Prelu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(alpha()->element_type() == output()->element_type());
+
+  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
+  {
+    if (input()->element_type() == DataType::S16)
+    {
+      LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && alpha()->zero_point() == 0 &&
+                             output()->zero_point() == 0);
+    }
+    double alpha_multiplier = input()->scale() * alpha()->scale() / output()->scale();
+    quantizeMultiplier(alpha_multiplier, &_output_multiplier_alpha, &_output_shift_alpha);
+    double identity_multiplier = input()->scale() / output()->scale();
+    quantizeMultiplier(identity_multiplier, &_output_multiplier_identity, &_output_shift_identity);
+  }
+  output()->resize(calculateShapeForBroadcast(input()->shape(), alpha()->shape()));
+}
+
+void Prelu::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Prelu::evalFloat() const
+{
+  const auto input_data = getTensorData<float>(input());
+  const auto alpha_data = getTensorData<float>(alpha());
+  const auto size = getTensorShape(input()).FlatSize();
+  auto output_data = getTensorData<float>(output());
+
+  auto PreluFunc = [](float input, float alpha) { return input >= 0.0 ? input : input * alpha; };
+
+  if (input()->shape() != alpha()->shape())
+  {
+    tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+        getTensorShape(input()), getTensorData<float>(input()), getTensorShape(alpha()),
+        getTensorData<float>(alpha()), getTensorShape(output()), getTensorData<float>(output()),
+        PreluFunc);
+  }
+  else
+  {
+    for (auto i = decltype(size){0}; i < size; ++i)
+    {
+      if (input_data[i] >= 0)
+        output_data[i] = input_data[i];
+      else
+        output_data[i] = input_data[i] * alpha_data[i];
+    }
+  }
+}
+
+void Prelu::evalQuantized() const
+{
+  tflite::PreluParams op_params{};
+
+  op_params.input_offset = -input()->zero_point(); // Note the '-'.
+  op_params.alpha_offset = -alpha()->zero_point(); // Note the '-'.
+  op_params.output_offset = output()->zero_point();
+  op_params.output_shift_1 = _output_shift_identity;
+  op_params.output_multiplier_1 = _output_multiplier_identity;
+  op_params.output_shift_2 = _output_shift_alpha;
+  op_params.output_multiplier_2 = _output_multiplier_alpha;
+
+  if (input()->shape() != alpha()->shape())
+  {
+    tflite::reference_ops::BroadcastPrelu4DSlow(
+        op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+        getTensorShape(alpha()), getTensorData<uint8_t>(alpha()), getTensorShape(output()),
+        getTensorData<uint8_t>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Prelu<uint8_t>(op_params, getTensorShape(input()),
+                                          getTensorData<uint8_t>(input()), getTensorShape(alpha()),
+                                          getTensorData<uint8_t>(alpha()), getTensorShape(output()),
+                                          getTensorData<uint8_t>(output()));
+  }
+}
+
+void Prelu::evalQuantizedS16() const
+{
+  constexpr int32_t quantized_min = std::numeric_limits<int16_t>::min();
+  constexpr int32_t quantized_max = std::numeric_limits<int16_t>::max();
+
+  auto fn = [this, quantized_min, quantized_max](int16_t input_val, int16_t alpha_val) {
+    const int32_t output_val =
+        input_val >= 0
+            ? tflite::MultiplyByQuantizedMultiplier(input_val, _output_multiplier_identity,
+                                                    _output_shift_identity)
+            : tflite::MultiplyByQuantizedMultiplier(input_val * alpha_val, _output_multiplier_alpha,
+                                                    _output_shift_alpha);
+    const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_val));
+    return static_cast<int16_t>(clamped_output);
+  };
+
+  BinaryOpBroadcastSlow(getTensorShape(input()), getTensorData<int16_t>(input()),
+                        getTensorShape(alpha()), getTensorData<int16_t>(alpha()),
+                        getTensorShape(output()), getTensorData<int16_t>(output()), fn);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.h b/compiler/luci-interpreter/src/kernels/Prelu.h
new file mode 100644
index 000000000..c7911a63f
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Prelu.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PRELU_H
+#define LUCI_INTERPRETER_KERNELS_PRELU_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Prelu : public Kernel
+{
+public:
+  Prelu(const Tensor *input, const Tensor *alpha, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *alpha() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedS16() const;
+
+private:
+  int32_t _output_multiplier_alpha = 0;
+  int32_t _output_shift_alpha = 0;
+  int32_t _output_multiplier_identity = 0;
+  int32_t _output_shift_identity = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PRELU_H
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.test.cpp b/compiler/luci-interpreter/src/kernels/Prelu.test.cpp
new file mode 100644
index 000000000..30702c826
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Prelu.test.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Prelu.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> alpha_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
+           std::initializer_list<T> alpha_data, std::initializer_list<T> output_data)
+{
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor alpha_tensor = makeInputTensor<element_type>(alpha_shape, alpha_data);
+  Tensor output_tensor = makeOutputTensor(element_type);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(PreluTest, FloatSimple)
+{
+  Check<float>(/*input_shape=*/{2, 3}, /*alpha_shape=*/{2, 3},
+               /*output_shape=*/{2, 3},
+               /*input_data=*/
+               {
+                   0.0f, 1.0f, 3.0f,   // Row 1
+                   1.0f, -1.0f, -2.0f, // Row 2
+               },
+               /*alpha_data=*/
+               {
+                   0.0f, 0.5f, 0.1f, // Row 1
+                   0.0f, 0.5f, 0.1f, // Row 2
+               },
+               /*output_data=*/
+               {
+                   0.0f, 1.0f, 3.0f,   // Row 1
+                   1.0f, -0.5f, -0.2f, // Row 2
+               });
+
+  SUCCEED();
+}
+
+TEST(PreluTest, FloatBroadcast)
+{
+  Check<float>(/*input_shape=*/{1, 2, 2, 3}, /*alpha_shape=*/{1, 1, 3},
+               /*output_shape=*/{1, 2, 2, 3},
+               /*input_data=*/
+               {
+                   0.0f, 0.0f, 0.0f,    // Row 1, Column 1
+                   1.0f, 1.0f, 1.0f,    // Row 1, Column 2
+                   -1.0f, -1.0f, -1.0f, // Row 2, Column 1
+                   -2.0f, -2.0f, -2.0f, // Row 2, Column 2
+               },
+               /*alpha_data=*/
+               {0.0f, 1.0f, 2.0f},
+               /*output_data=*/
+               {
+                   0.0f, 0.0f, 0.0f,   // Row 1, Column 1
+                   1.0f, 1.0f, 1.0f,   // Row 1, Column 2
+                   0.0f, -1.0f, -2.0f, // Row 2, Column 1
+                   0.0f, -2.0f, -4.0f, // Row 2, Column 2
+               });
+
+  SUCCEED();
+}
+
+float GetTolerance(float min, float max) { return (max - min) / 255.0; }
+
+TEST(PreluTest, Uint8Simple)
+{
+  std::vector<float> input_data{-0.8f, 0.2f, 0.9f, 0.7f, 0.1f, -0.4f};
+  std::vector<float> alpha_data{0.5f, 0.5f, 0.5f, 0.25f, 1.0f, 0.25f};
+  std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, 0.7f, 0.1f, -0.1f};
+
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
+                                                      quant_param.second, alpha_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 3, 1}));
+
+  SUCCEED();
+}
+
+TEST(PreluTest, Uint8Broadcast)
+{
+  std::vector<float> input_data{
+      0.0f,   0.0f,   0.0f,   // Row 1, Column 1
+      0.5f,   0.5f,   0.5f,   // Row 1, Column 2
+      -1.0f,  -1.0f,  -1.0f,  // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+  };
+  std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
+  std::vector<float> ref_output_data{
+      0.0f, 0.0f,    0.0f,  // Row 1, Column 1
+      0.5f, 0.5f,    0.5f,  // Row 1, Column 2
+      0.0f, -0.5f,   0.5f,  // Row 2, Column 1
+      0.0f, -0.125f, 0.125f // Row 2, Column 2
+  };
+  std::vector<float> ref_quant_output_data{
+      128, 128, 128, // Row 1, Column 1
+      192, 192, 192, // Row 1, Column 2
+      128, 64,  192, // Row 2, Column 1
+      128, 112, 144  // Row 2, Column 2
+  };
+  float kQuantizedTolerance = 2 * (1. / 256);
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 3}, quant_param.first,
+                                                      quant_param.second, input_data);
+  Tensor alpha_tensor =
+      makeInputTensor<DataType::U8>({1, 1, 3}, quant_param.first, quant_param.second, alpha_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 3}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray(ref_quant_output_data));
+}
+
+TEST(PreluTest, SInt16Simple)
+{
+  std::vector<float> input_data{-0.8f, 0.2f, 0.9f, 0.7f, 0.1f, -0.4f};
+  std::vector<float> alpha_data{0.5f, 0.5f, 0.5f, 0.25f, 1.0f, 0.25f};
+  std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, 0.7f, 0.1f, -0.1f};
+
+  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, input_data);
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, alpha_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PreluTest, SInt16Broadcast)
+{
+  std::vector<float> input_data{
+      0.0f,   0.0f,   0.0f,   // Row 1, Column 1
+      0.5f,   0.5f,   0.5f,   // Row 1, Column 2
+      -1.0f,  -1.0f,  -1.0f,  // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+  };
+  std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
+  std::vector<float> ref_output_data{
+      0.0f, 0.0f,    0.0f,  // Row 1, Column 1
+      0.5f, 0.5f,    0.5f,  // Row 1, Column 2
+      0.0f, -0.5f,   0.5f,  // Row 2, Column 1
+      0.0f, -0.125f, 0.125f // Row 2, Column 2
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 2, 3}, 0.01, 0, input_data);
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 3}, 0.1, 0, alpha_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.001, 0);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 3}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PreluTest, Input_Output_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor alpha_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, Input_Alpha_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, Invalid_Input_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor alpha_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Relu.cpp b/compiler/luci-interpreter/src/kernels/Relu.cpp
new file mode 100644
index 000000000..a2e02d708
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Relu::Relu(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Relu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  if (input()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+
+  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
+  {
+    double multiplier = input()->scale() / output()->scale();
+    quantizeMultiplier(multiplier, &_output_multiplier, &_output_shift);
+  }
+  output()->resize(input()->shape());
+}
+
+void Relu::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Relu::evalFloat() const
+{
+  const auto input_data = getTensorData<float>(input());
+  const auto input_shape = getTensorShape(input());
+  auto output_data = getTensorData<float>(output());
+  auto output_shape = getTensorShape(output());
+
+  tflite::optimized_ops::Relu(input_shape, input_data, output_shape, output_data);
+}
+
+void Relu::evalQuantized() const
+{
+  tflite::ReluParams params;
+  params.input_offset = input()->zero_point();
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = _output_multiplier;
+  params.output_shift = _output_shift;
+
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
+  params.quantized_activation_max = static_cast<int32_t>(std::numeric_limits<uint8_t>::max());
+
+  tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+void Relu::evalQuantizedS16() const
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  auto *output_data = getTensorData<int16_t>(output());
+
+  constexpr int32_t output_min = 0;
+  constexpr int32_t output_max = std::numeric_limits<int16_t>::max();
+
+  const int32_t num_elements = input()->shape().num_elements();
+
+  for (int32_t i = 0; i < num_elements; ++i)
+  {
+    const int32_t input_val = input_data[i];
+    int32_t output_val =
+        tflite::MultiplyByQuantizedMultiplier(input_val, _output_multiplier, _output_shift);
+    output_val = std::max(output_val, output_min);
+    output_val = std::min(output_val, output_max);
+    output_data[i] = static_cast<int16_t>(output_val);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Relu.h b/compiler/luci-interpreter/src/kernels/Relu.h
new file mode 100644
index 000000000..b813f0cdf
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RELU_H
+#define LUCI_INTERPRETER_KERNELS_RELU_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Relu : public Kernel
+{
+public:
+  Relu(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedS16() const;
+
+private:
+  int32_t _output_multiplier{0};
+  int32_t _output_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RELU_H
diff --git a/compiler/luci-interpreter/src/kernels/Relu.test.cpp b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
new file mode 100644
index 000000000..cabefa733
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(ReluTest, FloatSimple)
+{
+  std::vector<float> input_data{
+      0.0f, 1.0f,  3.0f,  // Row 1
+      1.0f, -1.0f, -2.0f, // Row 2
+  };
+
+  std::vector<float> ref_output_data{
+      0.0f, 1.0f, 3.0f, // Row 1
+      1.0f, 0.0f, 0.0f, // Row 2
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(ReluTest, Uint8Quantized)
+{
+  std::vector<float> input_data{
+      0, -6, 2, 4, //
+      3, -2, 7, 1, //
+  };
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float f_min = (-128.0 / 128.0) * 8;
+  const float f_max = (127.0 / 128.0) * 8;
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({128, 128, 160, 192, 176, 128, 240, 144}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
+}
+
+TEST(ReluTest, Uint8Requantized)
+{
+  std::vector<float> input_data{
+      0, -6, 2, 4, //
+      3, -2, 7, 1, //
+  };
+
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float in_min = (-128.0 / 128.0) * 8;
+  const float in_max = (127.0 / 128.0) * 8;
+  const float out_min = (0.0 / 256.0) * 8;
+  const float out_max = (255.0 / 256.0) * 8;
+
+  std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first,
+                                                      quant_input.second, input_data);
+
+  std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({0, 0, 64, 128, 96, 0, 224, 32}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
+}
+
+TEST(ReluTest, SInt16)
+{
+  std::vector<float> input_data{
+      0, -6, 2, 4, //
+      3, -2, 7, 1, //
+  };
+  std::vector<float> ref_output_data{
+      0, 0, 2, 4, //
+      3, 0, 7, 1, //
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 4, 1}, 0.5, 0, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.25, 0);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(ReluTest, Input_Output_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ReluTest, Invalid_Input_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  Relu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.cpp b/compiler/luci-interpreter/src/kernels/Relu6.cpp
new file mode 100644
index 000000000..1046ef27b
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu6.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu6.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Relu6::Relu6(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Relu6::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  if (input()->element_type() == DataType::U8)
+  {
+    double multiplier = input()->scale() / output()->scale();
+    quantizeMultiplier(multiplier, &_output_multiplier, &_output_shift);
+  }
+  output()->resize(input()->shape());
+}
+
+void Relu6::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Relu6::evalFloat() const
+{
+  const auto input_data = getTensorData<float>(input());
+  const auto input_shape = getTensorShape(input());
+  auto output_data = getTensorData<float>(output());
+  auto output_shape = getTensorShape(output());
+
+  tflite::optimized_ops::Relu6(input_shape, input_data, output_shape, output_data);
+}
+
+void Relu6::evalQuantized() const
+{
+  tflite::ReluParams params;
+  params.input_offset = input()->zero_point();
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = _output_multiplier;
+  params.output_shift = _output_shift;
+
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
+  params.quantized_activation_max =
+      std::min(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()),
+               params.output_offset + static_cast<int32>(roundf(6.f / output()->scale())));
+
+  tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.h b/compiler/luci-interpreter/src/kernels/Relu6.h
new file mode 100644
index 000000000..f5030b588
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu6.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RELU6_H
+#define LUCI_INTERPRETER_KERNELS_RELU6_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Relu6 : public Kernel
+{
+public:
+  Relu6(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+private:
+  int32_t _output_multiplier{0};
+  int32_t _output_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RELU6_H
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.test.cpp b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
new file mode 100644
index 000000000..a7f104d85
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu6.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(Relu6Test, FloatSimple)
+{
+  std::vector<float> input_data{
+      0.0f, 1.0f,  3.0f,  // Row 1
+      7.0f, -1.0f, -2.0f, // Row 2
+  };
+
+  std::vector<float> ref_output_data{
+      0.0f, 1.0f, 3.0f, // Row 1
+      6.0f, 0.0f, 0.0f, // Row 2
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Relu6 kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST(Relu6Test, Uint8Quantized)
+{
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float f_min = (-128.0 / 128.0) * 10;
+  const float f_max = (127.0 / 128.0) * 10;
+  const float tolerance = (f_max - f_min) / 255.0;
+
+  std::vector<float> input_data{
+      0,  -6, 2, 8, //
+      -2, 3,  7, 1, //
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
+                                                      quant_param.second, input_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Relu6 kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({128, 128, 154, 205, 128, 166, 205, 141}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, tolerance));
+}
+
+TEST(Relu6Test, Uint8Requantized)
+{
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float in_min = (-128.0 / 128.0) * 10;
+  const float in_max = (127.0 / 128.0) * 10;
+  const float out_min = (0.0 / 256.0) * 0;
+  const float out_max = (255.0 / 256.0) * 6;
+  const float tolerance = (in_max - in_min) / 255.0;
+
+  std::vector<float> input_data{
+      0,  -6, 2, 8, //
+      -2, 3,  7, 1, //
+  };
+
+  std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first,
+                                                      quant_input.second, input_data);
+
+  std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
+
+  Relu6 kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({0, 0, 87, 255, 0, 127, 255, 43}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, tolerance));
+}
+
+TEST(Relu6Test, Input_Output_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Relu6 kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(Relu6Test, Invalid_Input_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  Relu6 kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Reshape.test.cpp b/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
index 7255b8132..38159380f 100644
--- a/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
@@ -42,8 +42,7 @@ TEST(ReshapeTest, Regular)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(input_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
 }
 
 TEST(ReshapeTest, UnknownDimension)
@@ -60,8 +59,7 @@ TEST(ReshapeTest, UnknownDimension)
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(input_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
new file mode 100644
index 000000000..9385855cf
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeBilinear.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ResizeBilinear::ResizeBilinear(const Tensor *input, const Tensor *size, Tensor *output,
+                               const ResizeBilinearParams &params)
+    : KernelWithParams<ResizeBilinearParams>({input, size}, {output}, params)
+{
+}
+
+void ResizeBilinear::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(size()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(size()->element_type() == DataType::S32);
+  if (params().half_pixel_centers && params().align_corners)
+    throw std::runtime_error("If half_pixel_centers is True, align_corners must be False.");
+  LUCI_INTERPRETER_CHECK(size()->shape().dim(0) == 2);
+  Shape output_shape(4);
+  output_shape.dim(0) = input()->shape().dim(0);
+  output_shape.dim(1) = getTensorData<int32_t>(size())[0];
+  output_shape.dim(2) = getTensorData<int32_t>(size())[1];
+  output_shape.dim(3) = input()->shape().dim(3);
+  output()->resize(output_shape);
+}
+
+void ResizeBilinear::execute() const
+{
+  tflite::ResizeBilinearParams op_params{};
+  op_params.align_corners = params().align_corners;
+  op_params.half_pixel_centers = params().half_pixel_centers;
+  switch (output()->element_type())
+  {
+    case DataType::FLOAT32:
+      tflite::optimized_ops::ResizeBilinear(
+          op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(size()),
+          getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    case DataType::U8:
+      tflite::optimized_ops::ResizeBilinear(
+          op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+          getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
+          getTensorData<uint8_t>(output()));
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.h b/compiler/luci-interpreter/src/kernels/ResizeBilinear.h
new file mode 100644
index 000000000..b7bdc2ab7
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ResizeBilinear : public KernelWithParams<ResizeBilinearParams>
+{
+public:
+  ResizeBilinear(const Tensor *input, const Tensor *shape, Tensor *output,
+                 const ResizeBilinearParams &params);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *size() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
new file mode 100644
index 000000000..51c1359da
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeBilinear.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> size_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+           std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
+           bool align_corners, bool half_pixel_centers)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> size_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<int32_t> size_data,
+                    std::initializer_list<float> output_data, bool align_corners,
+                    bool half_pixel_centers)
+{
+  // On TFlite example use Uint8 value it self, so this means quant param scale 1.0f and zero
+  // point 0.
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, 1.0, 0, input_data);
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1.0, 0);
+
+  ResizeBilinearParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+}
+
+template <typename T> class ResizeBilinearTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(ResizeBilinearTest, DataTypes);
+
+TYPED_TEST(ResizeBilinearTest, SimpleTest)
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                       3, 6,  //
+                       9, 12, //
+                       4, 10, //
+                       10, 16 //
+                   },
+                   {3, 3},
+                   {
+                       3, 5, 6,    //
+                       7, 9, 10,   //
+                       9, 11, 12,  //
+                       4, 8, 10,   //
+                       8, 12, 14,  //
+                       10, 14, 16, //
+                   },
+                   false, false);
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, HalfPixelCenterFloatTest)
+{
+  Check<float>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+               {
+                   1, 2, //
+                   3, 4, //
+                   1, 2, //
+                   3, 4  //
+               },
+               {3, 3},
+               {
+                   1, 1.5, 2, //
+                   2, 2.5, 3, //
+                   3, 3.5, 4, //
+                   1, 1.5, 2, //
+                   2, 2.5, 3, //
+                   3, 3.5, 4, //
+               },
+               false, true);
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, HalfPixelCenterUint8Test)
+{
+  Check<uint8_t>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                 {
+                     3, 6,  //
+                     9, 12, //
+                     4, 10, //
+                     12, 16 //
+                 },
+                 {3, 3},
+                 {
+                     2, 4, 6,    //
+                     6, 7, 9,    //
+                     9, 10, 12,  //
+                     4, 7, 10,   //
+                     8, 10, 13,  //
+                     12, 14, 16, //
+                 },
+                 false, true);
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
+                                                                          3, 6,  //
+                                                                          9, 12, //
+                                                                          4, 10, //
+                                                                          10, 16 //
+                                                                      });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
+                                                                             3, 6,  //
+                                                                             9, 12, //
+                                                                             4, 10, //
+                                                                             10, 16 //
+                                                                         });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
+                                                                             3, 6,  //
+                                                                             9, 12, //
+                                                                             4, 10, //
+                                                                             10, 16 //
+                                                                         });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, InvalidParams_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
+                                                                             3, 6,  //
+                                                                             9, 12, //
+                                                                             4, 10, //
+                                                                             10, 16 //
+                                                                         });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams params{};
+  params.align_corners = true;
+  params.half_pixel_centers = true;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
new file mode 100644
index 000000000..e4ad8f742
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeNearestNeighbor.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ResizeNearestNeighbor::ResizeNearestNeighbor(const Tensor *input, const Tensor *size,
+                                             Tensor *output,
+                                             const ResizeNearestNeighborParams &params)
+    : KernelWithParams<ResizeNearestNeighborParams>({input, size}, {output}, params)
+{
+}
+
+void ResizeNearestNeighbor::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(size()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(size()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(size()->shape().dim(0) == 2);
+  Shape output_shape(4);
+  output_shape.dim(0) = input()->shape().dim(0);
+  output_shape.dim(1) = getTensorData<int32_t>(size())[0];
+  output_shape.dim(2) = getTensorData<int32_t>(size())[1];
+  output_shape.dim(3) = input()->shape().dim(3);
+  output()->resize(output_shape);
+}
+
+void ResizeNearestNeighbor::execute() const
+{
+  tflite::ResizeNearestNeighborParams op_params{};
+  op_params.align_corners = params().align_corners;
+  op_params.half_pixel_centers = params().half_pixel_centers;
+  switch (output()->element_type())
+  {
+    case DataType::FLOAT32:
+      tflite::reference_ops::ResizeNearestNeighbor(
+          op_params, getTensorShape(input()), getTensorData<int32_t>(input()),
+          getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
+          getTensorData<int32_t>(output()));
+      break;
+    case DataType::U8:
+      tflite::optimized_ops::ResizeNearestNeighbor(
+          op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+          getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
+          getTensorData<uint8_t>(output()));
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.h b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.h
new file mode 100644
index 000000000..137d031cf
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ResizeNearestNeighbor : public KernelWithParams<ResizeNearestNeighborParams>
+{
+public:
+  ResizeNearestNeighbor(const Tensor *input, const Tensor *shape, Tensor *output,
+                        const ResizeNearestNeighborParams &params);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *size() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
new file mode 100644
index 000000000..9a804cca7
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeNearestNeighbor.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> size_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+           std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
+           bool align_corners, bool half_pixel_centers)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> size_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<int32_t> size_data,
+                    std::initializer_list<float> output_data, bool align_corners,
+                    bool half_pixel_centers)
+{
+  std::pair<float, int32_t> quant_param =
+      quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
+                                  std::max(input_data) > 0 ? std::max(input_data) : 0.f);
+  Tensor input_tensor =
+      makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.first);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+}
+
+template <typename T> class ResizeNearestNeighborTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(ResizeNearestNeighborTest, DataTypes);
+
+TYPED_TEST(ResizeNearestNeighborTest, SimpleTest)
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                       3, 6,  //
+                       9, 12, //
+                       4, 10, //
+                       10, 16 //
+                   },
+                   {3, 3},
+                   {
+                       3, 3, 6,    //
+                       3, 3, 6,    //
+                       9, 9, 12,   //
+                       4, 4, 10,   //
+                       4, 4, 10,   //
+                       10, 10, 16, //
+                   },
+                   false, false);
+}
+
+TYPED_TEST(ResizeNearestNeighborTest, AlignCenterTest)
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                       3, 6,  //
+                       9, 12, //
+                       4, 10, //
+                       10, 16 //
+                   },
+                   {3, 3},
+                   {
+                       3, 6, 6,    //
+                       9, 12, 12,  //
+                       9, 12, 12,  //
+                       4, 10, 10,  //
+                       10, 16, 16, //
+                       10, 16, 16, //
+                   },
+                   true, false);
+}
+
+TYPED_TEST(ResizeNearestNeighborTest, HalfPixelCenterTest)
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                       3, 6,  //
+                       9, 12, //
+                       4, 10, //
+                       10, 16 //
+                   },
+                   {3, 3},
+                   {
+                       3, 6, 6,    //
+                       9, 12, 12,  //
+                       9, 12, 12,  //
+                       4, 10, 10,  //
+                       10, 16, 16, //
+                       10, 16, 16, //
+                   },
+                   false, true);
+}
+
+TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
+                                                                          3, 6,  //
+                                                                          9, 12, //
+                                                                          4, 10, //
+                                                                          10, 16 //
+                                                                      });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
+                                                                             3, 6,  //
+                                                                             9, 12, //
+                                                                             4, 10, //
+                                                                             10, 16 //
+                                                                         });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeNearestNeighborTest, SizeDimInvalid_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
+                                                                             3, 6,  //
+                                                                             9, 12, //
+                                                                             4, 10, //
+                                                                             10, 16 //
+                                                                         });
+  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
index 69b55d2f2..d33b800be 100644
--- a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
@@ -29,17 +29,14 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor{DataType::FLOAT32, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(float));
-
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Rsqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ::testing::ElementsAreArray(ArrayFloatNear(output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.cpp b/compiler/luci-interpreter/src/kernels/Softmax.cpp
index 2fb7f3f2c..642c0ad75 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.cpp
@@ -19,6 +19,7 @@
 #include "kernels/Utils.h"
 
 #include <tensorflow/lite/kernels/internal/reference/softmax.h>
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
 
 #include <stdexcept>
 
@@ -35,7 +36,15 @@ Softmax::Softmax(const Tensor *input, Tensor *output, const SoftmaxParams &param
 
 void Softmax::configure()
 {
-  assert(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= 1);
+  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == 0);
+    tflite::SoftmaxParams op_params{};
+    op_params.table = _table;
+    tflite::optimized_ops::PopulateSoftmaxLookupTable(&op_params, input()->scale(), params().beta);
+  }
   output()->resize(input()->shape());
 }
 
@@ -46,6 +55,12 @@ void Softmax::execute() const
     case DataType::FLOAT32:
       evalFloat();
       break;
+    case DataType::S8:
+      evalQuantized<int8_t>();
+      break;
+    case DataType::U8:
+      evalQuantized<uint8_t>();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -53,12 +68,23 @@ void Softmax::execute() const
 
 void Softmax::evalFloat() const
 {
-  tflite::SoftmaxParams params{};
-  params.beta = _params.beta;
+  tflite::SoftmaxParams op_params{};
+  op_params.beta = params().beta;
 
-  tflite::reference_ops::Softmax(params, getTensorShape(input()), getTensorData<float>(input()),
+  tflite::reference_ops::Softmax(op_params, getTensorShape(input()), getTensorData<float>(input()),
                                  getTensorShape(output()), getTensorData<float>(output()));
 }
 
+template <typename T> void Softmax::evalQuantized() const
+{
+  tflite::SoftmaxParams op_params{};
+  op_params.table = const_cast<float *>(_table);
+  op_params.zero_point = output()->zero_point();
+  op_params.scale = output()->scale();
+
+  tflite::optimized_ops::Softmax(op_params, getTensorShape(input()), getTensorData<T>(input()),
+                                 getTensorShape(output()), getTensorData<T>(output()));
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.h b/compiler/luci-interpreter/src/kernels/Softmax.h
index 2e4eda492..1f281df1c 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.h
+++ b/compiler/luci-interpreter/src/kernels/Softmax.h
@@ -38,6 +38,9 @@ public:
 
 private:
   void evalFloat() const;
+  template <typename T> void evalQuantized() const;
+
+  float _table[256];
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
index 2193c3e83..d3d8209a5 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
@@ -26,15 +26,10 @@ namespace
 
 using namespace testing;
 
-TEST(SoftmaxTest, Float)
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Shape input_shape{2, 1, 2, 3};
-  std::vector<float> input_data{
-      5,  -9, 8,  //
-      -7, 2,  -4, //
-      1,  -2, 9,  //
-      3,  -6, -1, //
-  };
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
@@ -45,14 +40,61 @@ TEST(SoftmaxTest, Float)
   kernel.configure();
   kernel.execute();
 
-  std::vector<float> ref_output_data{
-      0.38514, 0.09497, 0.51989, //
-      0.20792, 0.51141, 0.28067, //
-      0.25212, 0.18678, 0.56110, //
-      0.48149, 0.19576, 0.32275, //
-  };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data)
+{
+  std::pair<float, int32_t> input_quant_param =
+      quantizationParams<uint8_t>(std::min<float>(std::min<float>(input_data), 0.f),
+                                  std::max<float>(std::max<float>(input_data), 0.f));
+  std::pair<float, int32_t> output_quant_param =
+      quantizationParams<uint8_t>(std::min<float>(std::min<float>(output_data), 0.f),
+                                  std::max<float>(std::max<float>(output_data), 0.f));
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
+  Tensor output_tensor =
+      makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+
+  SoftmaxParams params{};
+  params.beta = 0.1;
+
+  Softmax kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+}
+
+template <typename T> class SoftmaxTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(SoftmaxTest, DataTypes);
+
+TYPED_TEST(SoftmaxTest, Simple)
+{
+  Check<TypeParam>({2, 1, 2, 3}, {2, 1, 2, 3},
+                   {
+                       5, -9, 8,  //
+                       -7, 2, -4, //
+                       1, -2, 9,  //
+                       3, -6, -1, //
+                   },
+                   {
+                       0.38514, 0.09497, 0.51989, //
+                       0.20792, 0.51141, 0.28067, //
+                       0.25212, 0.18678, 0.56110, //
+                       0.48149, 0.19576, 0.32275, //
+                   });
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
index e4a0fd642..77b6655dc 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
@@ -35,13 +35,13 @@ TYPED_TEST_CASE(SpaceToDepthTest, DataTypes);
 
 TYPED_TEST(SpaceToDepthTest, SimpleCase)
 {
+  constexpr DataType element_type = getElementType<TypeParam>();
   std::vector<TypeParam> input_data{1, 5, 6, 7, 2, 3, 4, 8};
   Shape input_shape{1, 2, 2, 2};
-  Tensor input_tensor{getElementType<TypeParam>(), input_shape, {{}, {}}, ""};
-  input_tensor.writeData(input_data.data(), input_data.size() * sizeof(TypeParam));
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
   std::vector<TypeParam> output_data{1, 5, 6, 7, 2, 3, 4, 8};
   std::vector<int32_t> output_shape{1, 1, 1, 8};
-  Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+  Tensor output_tensor = makeOutputTensor(element_type);
 
   SpaceToDepthParams params{};
   params.block_size = 2;
diff --git a/compiler/luci-interpreter/src/kernels/Split.test.cpp b/compiler/luci-interpreter/src/kernels/Split.test.cpp
index 11d0b1ea9..2147d15c1 100644
--- a/compiler/luci-interpreter/src/kernels/Split.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Split.test.cpp
@@ -30,11 +30,11 @@ using namespace testing;
 template <typename T>
 void Check(int axis, int num_splits, std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
-           std::vector<std::vector<T>> output_data, DataType element_type)
+           std::vector<std::vector<T>> output_data)
 {
+  constexpr DataType element_type = getElementType<T>();
   Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis});
-  Tensor input_tensor{element_type, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(T));
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
 
   std::vector<Tensor> output_tensors;
   output_tensors.reserve(num_splits);
@@ -74,51 +74,42 @@ TYPED_TEST(SplitTest, FourDimensional)
                    {
                        {1, 2, 3, 4, 5, 6, 7, 8},        //
                        {9, 10, 11, 12, 13, 14, 15, 16}, //
-                   },
-                   getElementType<TypeParam>());
+                   });
   Check<TypeParam>(
       /*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {
-          {1, 2, 3, 4, 9, 10, 11, 12},  //
-          {5, 6, 7, 8, 13, 14, 15, 16}, //
-      },
-      getElementType<TypeParam>());
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
+                                                                   {1, 2, 3, 4, 9, 10, 11, 12},  //
+                                                                   {5, 6, 7, 8, 13, 14, 15, 16}, //
+                                                               });
   Check<TypeParam>(
       /*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {
-          {1, 2, 5, 6, 9, 10, 13, 14},  //
-          {3, 4, 7, 8, 11, 12, 15, 16}, //
-      },
-      getElementType<TypeParam>());
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
+                                                                   {1, 2, 5, 6, 9, 10, 13, 14},  //
+                                                                   {3, 4, 7, 8, 11, 12, 15, 16}, //
+                                                               });
   Check<TypeParam>(
       /*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {
-          {1, 3, 5, 7, 9, 11, 13, 15},  //
-          {2, 4, 6, 8, 10, 12, 14, 16}, //
-      },
-      getElementType<TypeParam>());
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
+                                                                   {1, 3, 5, 7, 9, 11, 13, 15},  //
+                                                                   {2, 4, 6, 8, 10, 12, 14, 16}, //
+                                                               });
 }
 
 TYPED_TEST(SplitTest, OneDimensional)
 {
   Check<TypeParam>(
       /*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
-      {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}}, getElementType<TypeParam>());
+      {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 }
 
 TYPED_TEST(SplitTest, NegativeAxis)
 {
   Check<TypeParam>(
       /*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-      {
-          {1, 2, 3, 4, 5, 6, 7, 8}, //
-          {9, 10, 11, 12, 13, 14, 15, 16},
-      },
-      getElementType<TypeParam>());
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
+                                                                   {1, 2, 3, 4, 5, 6, 7, 8}, //
+                                                                   {9, 10, 11, 12, 13, 14, 15, 16},
+                                                               });
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
index cdd208280..504db4493 100644
--- a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
@@ -29,17 +29,14 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor{DataType::FLOAT32, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(float));
-
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Sqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
   kernel.execute();
 
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ::testing::ElementsAreArray(ArrayFloatNear(output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
index 3a34284dd..ff9fb09d2 100644
--- a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
@@ -29,17 +29,14 @@ using namespace testing;
 template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<T> input_data, std::initializer_list<T> output_data,
-           DataType element_type, std::vector<int32_t> squeeze_dims)
+           std::initializer_list<int32_t> squeeze_dims)
 {
-  Tensor input_tensor{element_type, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(T));
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(element_type);
 
   SqueezeParams params{};
-  for (size_t i = 0; i < squeeze_dims.size(); i++)
-  {
-    params.squeeze_dims.push_back(squeeze_dims.at(i));
-  }
+  params.squeeze_dims = squeeze_dims;
 
   Squeeze kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
@@ -64,7 +61,7 @@ TYPED_TEST(SqueezeTest, TotalTest)
                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
       /*output_data=*/{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
                        13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
-      getElementType<TypeParam>(), {-1, 0});
+      {-1, 0});
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
index 5ab06e2ec..66dffcaf2 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
@@ -36,17 +36,12 @@ TEST(StridedSliceTest, Float)
   std::vector<int32_t> end_data{1, 3, 2};
   Shape strides_shape{3};
   std::vector<int32_t> strides_data{1, 1, 1};
-  Tensor input_tensor{DataType::FLOAT32, input_shape, {}, ""};
-  Tensor begin_tensor{DataType::S32, begin_shape, {}, ""};
-  Tensor end_tensor{DataType::S32, end_shape, {}, ""};
-  Tensor strides_tensor{DataType::S32, strides_shape, {}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data);
+  Tensor strides_tensor = makeInputTensor<DataType::S32>(strides_shape, strides_data);
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
-  input_tensor.writeData(input_data.data(), input_data.size() * sizeof(float));
-  begin_tensor.writeData(begin_data.data(), begin_data.size() * sizeof(int32_t));
-  end_tensor.writeData(end_data.data(), end_data.size() * sizeof(int32_t));
-  strides_tensor.writeData(strides_data.data(), strides_data.size() * sizeof(int32_t));
-
   StridedSliceParams params{};
   params.begin_mask = 0;
   params.end_mask = 0;
@@ -61,8 +56,7 @@ TEST(StridedSliceTest, Float)
 
   std::vector<int32_t> output_shape{3, 2};
   std::vector<float> output_data{1, 2, 3, 4, 5, 6};
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
@@ -70,24 +64,18 @@ TEST(StridedSliceTest, Uint8)
 {
   Shape input_shape{2, 3, 2};
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-  std::vector<uint8_t> quant_input_data = quantize<uint8_t>(input_data, 1.0f, 0);
   Shape begin_shape{3};
   std::vector<int32_t> begin_data{0, 0, 0};
   Shape end_shape{3};
   std::vector<int32_t> end_data{1, 3, 2};
   Shape strides_shape{3};
   std::vector<int32_t> strides_data{1, 1, 1};
-  Tensor input_tensor{DataType::U8, input_shape, {{1.0f}, {0}}, ""};
-  Tensor begin_tensor{DataType::S32, begin_shape, {}, ""};
-  Tensor end_tensor{DataType::S32, end_shape, {}, ""};
-  Tensor strides_tensor{DataType::S32, strides_shape, {}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, 1.0f, 0, input_data);
+  Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data);
+  Tensor strides_tensor = makeInputTensor<DataType::S32>(strides_shape, strides_data);
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1.0f, 0);
 
-  input_tensor.writeData(quant_input_data.data(), quant_input_data.size() * sizeof(uint8_t));
-  begin_tensor.writeData(begin_data.data(), begin_data.size() * sizeof(int32_t));
-  end_tensor.writeData(end_data.data(), end_data.size() * sizeof(int32_t));
-  strides_tensor.writeData(strides_data.data(), strides_data.size() * sizeof(int32_t));
-
   StridedSliceParams params{};
   params.begin_mask = 0;
   params.end_mask = 0;
@@ -102,9 +90,7 @@ TEST(StridedSliceTest, Uint8)
 
   std::vector<int32_t> output_shape{3, 2};
   std::vector<float> output_data{1, 2, 3, 4, 5, 6};
-  EXPECT_THAT(dequantize(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                         output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(output_data)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Sub.cpp b/compiler/luci-interpreter/src/kernels/Sub.cpp
new file mode 100644
index 000000000..dd9c1102f
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Sub.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sub.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Sub::Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubParams &params)
+    : KernelWithParams<SubParams>({input1, input2}, {output}, params)
+{
+}
+
+void Sub::configure()
+{
+  LUCI_INTERPRETER_CHECK(!(input1()->element_type() != input2()->element_type()))
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Sub::execute() const
+{
+  switch (input1()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Sub::evalFloat() const
+{
+  float activation_min{};
+  float activation_max{};
+  calculateActivationRange(_params.activation, &activation_min, &activation_max);
+
+  tflite::ArithmeticParams params{};
+  params.float_activation_min = activation_min;
+  params.float_activation_max = activation_max;
+
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+      getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastSubSlow(
+        params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+        getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+  }
+  else
+  {
+    tflite::optimized_ops::Sub(params, getTensorShape(input1()), getTensorData<float>(input1()),
+                               getTensorShape(input2()), getTensorData<float>(input2()),
+                               getTensorShape(output()), getTensorData<float>(output()));
+  }
+}
+
+void Sub::evalQuantized() const
+{
+  const auto input1_scale = static_cast<double>(input1()->scale());
+  const auto input2_scale = static_cast<double>(input2()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+
+  const int left_shift = 20;
+  const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
+  const double real_input1_multiplier = input1_scale / twice_max_input_scale;
+  const double real_input2_multiplier = input2_scale / twice_max_input_scale;
+  const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
+
+  int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
+  int input1_shift{}, input2_shift{}, output_shift{};
+  quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
+  quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
+  quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ArithmeticParams params{};
+  params.left_shift = left_shift;
+  // The kernel expects inputs' zero points to be negated.
+  params.input1_offset = -input1()->zero_point(); // Note the '-'.
+  params.input1_multiplier = input1_multiplier;
+  params.input1_shift = input1_shift;
+  params.input2_offset = -input2()->zero_point(); // Note the '-'.
+  params.input2_multiplier = input2_multiplier;
+  params.input2_shift = input2_shift;
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = output_multiplier;
+  params.output_shift = output_shift;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+      getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastSubSlow(
+        params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
+        getTensorShape(input2()), getTensorData<uint8_t>(input2()), getTensorShape(output()),
+        getTensorData<uint8_t>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Sub(params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
+                               getTensorShape(input2()), getTensorData<uint8_t>(input2()),
+                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Sub.h b/compiler/luci-interpreter/src/kernels/Sub.h
new file mode 100644
index 000000000..d7940b5c6
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Sub.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SUB_H
+#define LUCI_INTERPRETER_KERNELS_SUB_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Sub : public KernelWithParams<SubParams>
+{
+public:
+  Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubParams &params);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SUB_H
diff --git a/compiler/luci-interpreter/src/kernels/Sub.test.cpp b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
new file mode 100644
index 000000000..9f77fe7e0
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sub.h"
+#include "kernels/TestUtils.h"
+
+#include <algorithm>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+using std::pair;
+using std::vector;
+using std::transform;
+using std::initializer_list;
+
+// for quantized Add, the error shouldn't exceed step
+float GetTolerance(float min, float max)
+{
+  float kQuantizedStep = (max - min) / 255.0;
+  return kQuantizedStep;
+}
+
+TEST(SubTest, Uint8)
+{
+  Shape base_shape = {2, 3, 1, 2};
+  vector<float> base_data = {-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                             1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  vector<Shape> test_shapes = {{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  vector<float> test_data = {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  vector<vector<int32_t>> output_shapes = {{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  vector<vector<float>> output_data = {
+      {-0.5f, 2.0f,  0.1f,  1.8f,  -1.3f, 1.4f,  0.7f, 0.2f,  1.3f, 0.0f,  -0.1f, -0.4f,
+       0.6f,  -1.4f, 1.2f,  -1.6f, -0.2f, -2.0f, 1.0f, 2.5f,  1.6f, 2.3f,  0.2f,  1.9f,
+       -1.8f, -0.3f, -1.2f, -0.5f, -2.6f, -0.9f, 0.5f, -2.5f, 1.1f, -2.7f, -0.3f, -3.0f},
+      {-0.5f, 2.0f, 1.3f, 0.0f, -0.2f, -2.0f, 1.0f, 2.5f, -1.2f, -0.5f, -0.3f, -3.0f},
+      {-0.5f, 2.1f,  -0.6f, 2.0f,  0.1f,  2.7f,  0.7f, 0.3f,  0.6f,  0.2f,  1.3f,  0.9f,
+       0.6f,  -1.3f, 0.5f,  -1.4f, 1.2f,  -0.7f, 0.7f, 2.3f,  0.2f,  1.8f,  0.3f,  1.9f,
+       -2.1f, -0.5f, -2.6f, -1.0f, -2.5f, -0.9f, 0.2f, -2.7f, -0.3f, -3.0f, -0.2f, -3.0f},
+      {-0.5f, 2.1f, 0.6f, 0.2f, 1.2f, -0.7f, 0.7f, 2.3f, -2.6f, -1.0f, -0.2f, -3.0f}};
+
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
+  pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
+  for (size_t i = 0; i < output_data.size(); ++i)
+  {
+    Tensor input1_tensor =
+        makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
+                                                         quant_param.second, test_data);
+    Tensor output_tensor =
+        makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    SubParams params{};
+    params.activation = Activation::NONE;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+
+  // Inversion step for output_data, because subtract is not commutative operation
+  auto multiply = [](auto &i) {
+    transform(i.begin(), i.end(), i.begin(), [](auto &value) { return value * -1.0f; });
+  };
+  for_each(output_data.begin(), output_data.end(), multiply);
+
+  // Re-run with exchanged inputs.
+  for (size_t i = 0; i < output_data.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
+                                                         quant_param.second, test_data);
+    Tensor input2_tensor =
+        makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+    Tensor output_tensor =
+        makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    SubParams params{};
+    params.activation = Activation::NONE;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST(SubTest, Float)
+{
+  Shape base_shape = {2, 3, 1, 2};
+  vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  vector<vector<int32_t>> output_shapes{{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  vector<vector<float>> test_outputs = {
+      {0.0f, 2.0f, 0.1f, 1.8f, 0.0f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, 0.0f, 0.0f,
+       0.6f, 0.0f, 1.2f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
+       0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 0.0f, 1.1f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 2.0f, 1.3f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 2.1f, 0.0f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
+       0.6f, 0.0f, 0.5f, 0.0f, 1.2f, 0.0f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
+       0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 2.1f, 0.6f, 0.2f, 1.2f, 0.0f, 0.7f, 2.3f, 0.0f, 0.0f, 0.0f, 0.0f}};
+
+  vector<float> input1_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                            1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
+    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
+    Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+    SubParams params{};
+    params.activation = Activation::RELU;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
+        << "With shape number " << i;
+
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST(SubTest, Input_Output_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  SubParams params{};
+  params.activation = Activation::RELU;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(SubTest, Invalid_Input_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  SubParams params{};
+  params.activation = Activation::RELU;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
index 392b8672d..f91ffa1db 100644
--- a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
@@ -45,8 +45,7 @@ TEST(TanhTest, Float)
       0,          -0.9999877, 0.9640275, 0.999329,  //
       0.99505475, -0.9640275, 1,         0.7615941, //
   };
-  EXPECT_THAT(extractTensorData<float>(output_tensor),
-              ElementsAreArray(ArrayFloatNear(ref_output_data)));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
 }
 
 TEST(TanhTest, Uint8)
@@ -70,13 +69,10 @@ TEST(TanhTest, Uint8)
       0,  -6, 2, 4, //
       -4, -2, 8, 1, //
   };
-  Tensor input_tensor{
-      DataType::U8, {2, 6, 4, 1}, {{input_quant_param.first}, {input_quant_param.second}}, ""};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 6, 4, 1}, input_quant_param.first,
+                                                      input_quant_param.second, input_data);
   Tensor output_tensor =
       makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
-  std::vector<uint8_t> quantize_input =
-      quantize<uint8_t>(input_data, input_quant_param.first, input_quant_param.second);
-  input_tensor.writeData(quantize_input.data(), quantize_input.size() * sizeof(uint8_t));
 
   Tanh kernel(&input_tensor, &output_tensor);
   kernel.configure();
@@ -97,9 +93,7 @@ TEST(TanhTest, Uint8)
       -0.999329, -0.96402,  0.99999,  0.76159,  //
   };
   std::vector<int32_t> ref_output_shape{2, 6, 4, 1};
-  EXPECT_THAT(dequantize<uint8_t>(extractTensorData<uint8_t>(output_tensor), output_tensor.scale(),
-                                  output_tensor.zero_point()),
-              ElementsAreArray(ArrayFloatNear(ref_output_data, kTanhTolerance)));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data, kTanhTolerance));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.cpp b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
index 2c8a6ae78..4c19c8810 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.cpp
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
@@ -17,6 +17,8 @@
 
 #include "kernels/TestUtils.h"
 
+#include <stdexcept>
+
 namespace luci_interpreter
 {
 namespace kernels
@@ -34,7 +36,25 @@ Tensor makeOutputTensor(DataType element_type, float scale, int32_t zero_point)
   return Tensor(element_type, {}, {{scale}, {zero_point}}, "");
 }
 
-std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float> &values, float max_abs_error)
+std::vector<float> dequantizeTensorData(const Tensor &tensor)
+{
+  if (tensor.element_type() == DataType::U8)
+  {
+    return dequantize(extractTensorData<uint8_t>(tensor), tensor.scale(), tensor.zero_point());
+  }
+  else if (tensor.element_type() == DataType::S16)
+  {
+    // S16 quantization is symmetric, so zero point should be zero.
+    assert(tensor.zero_point() == 0);
+    return dequantize(extractTensorData<int16_t>(tensor), tensor.scale(), 0);
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+Matcher<std::vector<float>> FloatArrayNear(const std::vector<float> &values, float max_abs_error)
 {
   std::vector<Matcher<float>> matchers;
   matchers.reserve(values.size());
@@ -42,7 +62,7 @@ std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float> &values, flo
   {
     matchers.emplace_back(FloatNear(v, max_abs_error));
   }
-  return matchers;
+  return ElementsAreArray(matchers);
 }
 
 std::vector<int32_t> extractTensorShape(const Tensor &tensor)
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.h b/compiler/luci-interpreter/src/kernels/TestUtils.h
index 5311a1949..e5bd6a2db 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.h
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.h
@@ -32,6 +32,9 @@ namespace kernels
 namespace testing
 {
 
+template <typename T>
+std::vector<T> quantize(const std::vector<float> &data, float scale, int32_t zero_point);
+
 template <DataType DT>
 Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeImpl<DT>::Type> &data)
 {
@@ -40,6 +43,17 @@ Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeIm
   return tensor;
 }
 
+template <DataType DT>
+Tensor makeInputTensor(const Shape &shape, float scale, int32_t zero_point,
+                       const std::vector<float> &data)
+{
+  using NativeT = typename DataTypeImpl<DT>::Type;
+  Tensor tensor(DT, shape, {{scale}, {zero_point}}, "");
+  std::vector<NativeT> quantized_data = quantize<NativeT>(data, scale, zero_point);
+  tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(NativeT));
+  return tensor;
+}
+
 Tensor makeOutputTensor(DataType element_type);
 Tensor makeOutputTensor(DataType element_type, float scale, int32_t zero_point);
 
@@ -65,27 +79,44 @@ template <typename T> std::vector<T> extractTensorData(const Tensor &tensor)
   return std::vector<T>(data_ptr, data_ptr + tensor.shape().num_elements());
 }
 
-std::vector<::testing::Matcher<float>> ArrayFloatNear(const std::vector<float> &values,
+std::vector<float> dequantizeTensorData(const Tensor &tensor);
+
+// Array version of `::testing::FloatNear` matcher.
+::testing::Matcher<std::vector<float>> FloatArrayNear(const std::vector<float> &values,
                                                       float max_abs_error = 1.0e-5f);
 
 template <typename T>
-inline std::vector<T> quantize(const std::vector<float> &data, float scale, int32_t zero_point)
+std::vector<T> quantize(const std::vector<float> &data, float scale, int32_t zero_point)
 {
-  assert(!std::is_floating_point<T>::value);
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
+
+  float q_min{}, q_max{};
+  if (std::is_signed<T>::value)
+  {
+    // For now, assume that signed type implies signed symmetric quantization.
+    assert(zero_point == 0);
+    q_min = -std::numeric_limits<T>::max();
+    q_max = std::numeric_limits<T>::max();
+  }
+  else
+  {
+    q_min = 0;
+    q_max = std::numeric_limits<T>::max();
+  }
+
   std::vector<T> q;
   for (const auto &f : data)
   {
-    q.push_back(static_cast<T>(std::max<float>(
-        std::numeric_limits<T>::lowest(),
-        std::min<float>(std::numeric_limits<T>::max(), std::round(zero_point + (f / scale))))));
+    q.push_back(static_cast<T>(
+        std::max<float>(q_min, std::min<float>(q_max, std::round(zero_point + (f / scale))))));
   }
   return q;
 }
 
 template <typename T>
-inline std::vector<float> dequantize(const std::vector<T> &data, float scale, int32_t zero_point)
+std::vector<float> dequantize(const std::vector<T> &data, float scale, int32_t zero_point)
 {
-  assert(!std::is_floating_point<T>::value);
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
   std::vector<float> f;
   for (const T &q : data)
   {
@@ -94,18 +125,16 @@ inline std::vector<float> dequantize(const std::vector<T> &data, float scale, in
   return f;
 }
 
+// NOTE Returns scale and zero point for _asymmetric_ range (both signed and unsigned).
 template <typename T> std::pair<float, int32_t> quantizationParams(float f_min, float f_max)
 {
-  if (std::is_floating_point<T>::value)
-  {
-    return {1.0f, 0};
-  }
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
   int32_t zero_point = 0;
-  double scale = 0;
+  float scale = 0;
   const T qmin = std::numeric_limits<T>::lowest();
   const T qmax = std::numeric_limits<T>::max();
-  const double qmin_double = qmin;
-  const double qmax_double = qmax;
+  const float qmin_double = qmin;
+  const float qmax_double = qmax;
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
   assert(f_max >= 0);
@@ -131,16 +160,16 @@ template <typename T> std::pair<float, int32_t> quantizationParams(float f_min,
   // The arithmetic error on the zero point computed from either pair
   // will be roughly machine_epsilon * (sum of absolute values of terms)
   // so we want to use the variant that adds the smaller terms.
-  const double zero_point_from_min = qmin_double - f_min / scale;
-  const double zero_point_from_max = qmax_double - f_max / scale;
+  const float zero_point_from_min = qmin_double - f_min / scale;
+  const float zero_point_from_max = qmax_double - f_max / scale;
 
-  const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(f_min / scale);
+  const float zero_point_from_min_error = std::abs(qmin_double) + std::abs(f_min / scale);
 
-  const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(f_max / scale);
+  const float zero_point_from_max_error = std::abs(qmax_double) + std::abs(f_max / scale);
 
-  const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error
-                                       ? zero_point_from_min
-                                       : zero_point_from_max;
+  const float zero_point_double = zero_point_from_min_error < zero_point_from_max_error
+                                      ? zero_point_from_min
+                                      : zero_point_from_max;
 
   // Now we need to nudge the zero point to be an integer
   // (our zero points are integer, and this is motivated by the requirement
@@ -168,7 +197,7 @@ template <typename T> std::pair<float, int32_t> quantizationParams(float f_min,
   assert(qmin <= nudged_zero_point);
   zero_point = nudged_zero_point;
   // finally, return the values
-  return {static_cast<float>(scale), zero_point};
+  return {scale, zero_point};
 }
 
 inline float getTolerance(float min, float max, int quantize_steps)
diff --git a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
index 87e6e2a00..1c99223a8 100644
--- a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
@@ -29,14 +29,11 @@ using namespace testing;
 template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> perm_shape,
            std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
-           std::initializer_list<int32_t> perm_data, std::initializer_list<T> output_data,
-           DataType element_type)
+           std::initializer_list<int32_t> perm_data, std::initializer_list<T> output_data)
 {
-  Tensor input_tensor{element_type, input_shape, {}, ""};
-  input_tensor.writeData(input_data.begin(), input_data.size() * sizeof(T));
-
-  Tensor perm_tensor{DataType::S32, perm_shape, {}, ""};
-  perm_tensor.writeData(perm_data.begin(), perm_data.size() * sizeof(int32_t));
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor perm_tensor = makeInputTensor<DataType::S32>(perm_shape, perm_data);
   Tensor output_tensor = makeOutputTensor(element_type);
 
   Transpose kernel(&input_tensor, &perm_tensor, &output_tensor);
@@ -60,8 +57,7 @@ TYPED_TEST(TransposeTest, Small3D)
                                    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
                    /*perm_data=*/{2, 0, 1},
                    /*output_data=*/{0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
-                                    2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23},
-                   getElementType<TypeParam>());
+                                    2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23});
 }
 
 TYPED_TEST(TransposeTest, Large4D)
@@ -84,8 +80,7 @@ TYPED_TEST(TransposeTest, Large4D)
                        10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
                        70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
                        15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
-                       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119},
-      getElementType<TypeParam>());
+                       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
 }
 
 TYPED_TEST(TransposeTest, Large2D)
@@ -101,15 +96,13 @@ TYPED_TEST(TransposeTest, Large2D)
                       90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
                       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
       /*perm_data=*/{1, 0},
-      /*output_data=*/{0,  12, 24, 36,  48,  60, 72, 84, 96,  108, 1,  13, 25, 37,  49,
-                       61, 73, 85, 97,  109, 2,  14, 26, 38,  50,  62, 74, 86, 98,  110,
-                       3,  15, 27, 39,  51,  63, 75, 87, 99,  111, 4,  16, 28, 40,  52,
-                       64, 76, 88, 100, 112, 5,  17, 29, 41,  53,  65, 77, 89, 101, 113,
-                       6,  18, 30, 42,  54,  66, 78, 90, 102, 114, 7,  19, 31, 43,  55,
-                       67, 79, 91, 103, 115, 8,  20, 32, 44,  56,  68, 80, 92, 104, 116,
-                       9,  21, 33, 45,  57,  69, 81, 93, 105, 117, 10, 22, 34, 46,  58,
-                       70, 82, 94, 106, 118, 11, 23, 35, 47,  59,  71, 83, 95, 107, 119},
-      getElementType<TypeParam>());
+      /*output_data=*/{
+          0,  12, 24, 36, 48, 60, 72, 84, 96,  108, 1,  13, 25, 37, 49, 61, 73, 85, 97,  109,
+          2,  14, 26, 38, 50, 62, 74, 86, 98,  110, 3,  15, 27, 39, 51, 63, 75, 87, 99,  111,
+          4,  16, 28, 40, 52, 64, 76, 88, 100, 112, 5,  17, 29, 41, 53, 65, 77, 89, 101, 113,
+          6,  18, 30, 42, 54, 66, 78, 90, 102, 114, 7,  19, 31, 43, 55, 67, 79, 91, 103, 115,
+          8,  20, 32, 44, 56, 68, 80, 92, 104, 116, 9,  21, 33, 45, 57, 69, 81, 93, 105, 117,
+          10, 22, 34, 46, 58, 70, 82, 94, 106, 118, 11, 23, 35, 47, 59, 71, 83, 95, 107, 119});
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
index 898bae3da..07d92f07f 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
@@ -43,18 +43,6 @@ void TransposeConv::configure()
   assert(input()->element_type() == DataType::FLOAT32 || input()->element_type() == DataType::U8);
   assert(input()->element_type() == output()->element_type());
   assert(input()->shape().dim(3) == filter()->shape().dim(3));
-  if (input()->element_type() == DataType::U8)
-  {
-    _scratch_tensor =
-        std::make_unique<Tensor>(DataType::S32, output()->shape(), AffineQuantization{}, "");
-    double real_multiplier = 0.0;
-    const double input_product_scale = input()->scale() * filter()->scale();
-    assert(input_product_scale >= 0);
-    real_multiplier = input_product_scale / output()->scale();
-    int exponent;
-    quantizeMultiplier(real_multiplier, &_output_multiplier, &exponent);
-    _output_shift = -exponent;
-  }
 
   const int num_dims = output_shape()->shape().dim(0);
   Shape out_shape(num_dims);
@@ -62,6 +50,31 @@ void TransposeConv::configure()
   for (int i = 0; i < num_dims; i++)
     out_shape.dim(i) = shape_data[i];
   output()->resize(out_shape);
+
+  const int32_t filter_height = filter()->shape().dim(1);
+  const int32_t filter_width = filter()->shape().dim(2);
+  const int32_t output_height = out_shape.dim(1);
+  const int32_t output_width = out_shape.dim(2);
+
+  const int32_t unused_output_height =
+      computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
+  const int32_t unused_output_width =
+      computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);
+
+  _padding_height =
+      computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
+  _padding_width =
+      computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);
+
+  if (input()->element_type() == DataType::U8)
+  {
+    _scratch_tensor =
+        std::make_unique<Tensor>(DataType::S32, output()->shape(), AffineQuantization{}, "");
+    const double input_product_scale = input()->scale() * filter()->scale();
+    assert(input_product_scale >= 0);
+    const double real_multiplier = input_product_scale / output()->scale();
+    quantizeMultiplier(real_multiplier, &_output_multiplier, &_output_shift);
+  }
 }
 
 void TransposeConv::execute() const
@@ -81,74 +94,45 @@ void TransposeConv::execute() const
 
 void TransposeConv::evalFloat() const
 {
-  const int width = output()->shape().dim(2);
-  const int height = output()->shape().dim(1);
-
-  const int filter_width = filter()->shape().dim(2);
-  const int filter_height = filter()->shape().dim(1);
-
-  int unused_output_height, unused_output_width;
-  unused_output_width =
-      computeOutputSize(params().padding, width, filter_width, params().stride_width, 1);
-  unused_output_height =
-      computeOutputSize(params().padding, height, filter_height, params().stride_height, 1);
-  int32_t offset = 0;
   tflite::ConvParams op_params{};
   op_params.padding_type = tflite::PaddingType::kSame;
-  op_params.padding_values.height = computePaddingWithOffset(
-      params().stride_height, 1, height, filter_height, unused_output_height, &offset);
-  op_params.padding_values.height_offset = offset;
-  op_params.padding_values.width = computePaddingWithOffset(
-      params().stride_width, 1, width, filter_width, unused_output_width, &offset);
-  op_params.padding_values.width_offset = offset;
+  op_params.padding_values.height = _padding_height;
+  op_params.padding_values.width = _padding_width;
   op_params.stride_height = params().stride_height;
   op_params.stride_width = params().stride_width;
   op_params.output_multiplier = _output_multiplier;
-  tflite::reference_ops::TransposeConv(
-      op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
-      getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
-      getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(),
-      (float *)nullptr);
+  tflite::reference_ops::TransposeConv(op_params,                                                //
+                                       getTensorShape(input()), getTensorData<float>(input()),   //
+                                       getTensorShape(filter()), getTensorData<float>(filter()), //
+                                       getTensorShape(bias()), getTensorData<float>(bias()),     //
+                                       getTensorShape(output()), getTensorData<float>(output()), //
+                                       tflite::RuntimeShape(), nullptr);
 }
 
 void TransposeConv::evalQuantized() const
 {
-  int32_t input_offset = -input()->zero_point();
-  int32_t filter_offset = -filter()->zero_point();
-  int32_t output_offset = filter()->zero_point();
-  const int width = output()->shape().dim(2);
-  const int height = output()->shape().dim(1);
-
-  const int filter_width = filter()->shape().dim(2);
-  const int filter_height = filter()->shape().dim(1);
-
-  int unused_output_height, unused_output_width;
-  unused_output_width =
-      computeOutputSize(params().padding, width, filter_width, params().stride_width, 1);
-  unused_output_height =
-      computeOutputSize(params().padding, height, filter_height, params().stride_height, 1);
-  int32_t offset = 0;
   tflite::ConvParams op_params{};
   op_params.padding_type = tflite::PaddingType::kSame;
-  op_params.padding_values.height = computePaddingWithOffset(
-      params().stride_height, 1, height, filter_height, unused_output_height, &offset);
-  op_params.padding_values.width = computePaddingWithOffset(
-      params().stride_width, 1, width, filter_width, unused_output_width, &offset);
+  op_params.padding_values.height = _padding_height;
+  op_params.padding_values.width = _padding_width;
   op_params.stride_height = params().stride_height;
   op_params.stride_width = params().stride_width;
-  op_params.input_offset = input_offset;
-  op_params.output_offset = output_offset;
-  op_params.weights_offset = filter_offset;
+  // The kernel expects input and filter zero points to be negated.
+  op_params.input_offset = -input()->zero_point();    // Note the '-'.
+  op_params.weights_offset = -filter()->zero_point(); // Note the '-'.
+  op_params.output_offset = output()->zero_point();
   op_params.output_multiplier = _output_multiplier;
-  op_params.output_shift = -_output_shift;
+  op_params.output_shift = _output_shift;
   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
 
-  tflite::reference_ops::TransposeConv(
-      op_params, getTensorShape(input()), getTensorData<uint8>(input()), getTensorShape(filter()),
-      getTensorData<uint8>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
-      getTensorShape(output()), getTensorData<uint8>(output()), tflite::RuntimeShape(),
-      (uint8 *)nullptr, getTensorData<int32_t>(_scratch_tensor.get()));
+  tflite::reference_ops::TransposeConv(op_params,                                                //
+                                       getTensorShape(input()), getTensorData<uint8>(input()),   //
+                                       getTensorShape(filter()), getTensorData<uint8>(filter()), //
+                                       getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
+                                       getTensorShape(output()), getTensorData<uint8>(output()), //
+                                       tflite::RuntimeShape(), nullptr,                          //
+                                       getTensorData<int32_t>(_scratch_tensor.get()));
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.h b/compiler/luci-interpreter/src/kernels/TransposeConv.h
index 3a0eae761..444439c65 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.h
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.h
@@ -47,6 +47,8 @@ private:
 private:
   std::unique_ptr<Tensor> _scratch_tensor;
 
+  int32_t _padding_height{};
+  int32_t _padding_width{};
   // The scaling factor from input to output (aka the 'real multiplier') can
   // be represented as a fixed point multiplier plus a left shift.
   int32_t _output_multiplier = 0;
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
index 0fbe9328b..5a69e7798 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -28,21 +28,18 @@ using namespace testing;
 
 template <typename T, typename B>
 void Check(std::initializer_list<int32_t> output_shape_shape,
-           std::initializer_list<int32_t> weight_shape,
-           std::initializer_list<int32_t> input_data_shape,
+           std::initializer_list<int32_t> weight_shape, std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<int32_t> output_shape_data, std::initializer_list<T> weight_data,
-           std::initializer_list<T> input_data_data, std::initializer_list<B> bias_data,
+           std::initializer_list<T> input_data, std::initializer_list<B> bias_data,
            std::initializer_list<T> output_data, luci::Padding padding, int32_t stride_height,
-           int32_t stride_width, DataType element_type)
+           int32_t stride_width)
 {
-  Tensor output_shape_tensor{element_type, output_shape_shape, {}, ""};
-  output_shape_tensor.writeData(output_shape_data.begin(), output_shape_data.size() * sizeof(T));
-  Tensor weight_tensor{element_type, weight_shape, {}, ""};
-  weight_tensor.writeData(weight_data.begin(), weight_data.size() * sizeof(T));
-  Tensor input_data_tensor{element_type, input_data_shape, {}, ""};
-  input_data_tensor.writeData(input_data_data.begin(), input_data_data.size() * sizeof(T));
-
+  constexpr DataType element_type = getElementType<T>();
+  Tensor output_shape_tensor =
+      makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data);
+  Tensor weight_tensor = makeInputTensor<element_type>(weight_shape, weight_data);
+  Tensor input_data_tensor = makeInputTensor<element_type>(input_shape, input_data);
   Tensor output_tensor = makeOutputTensor(element_type);
 
   TransposeConvParams params{};
@@ -71,14 +68,13 @@ void Check(std::initializer_list<int32_t> output_shape_shape,
 TEST(TransposeConvTest, FloatSimple)
 {
   Check<float, float>(
-      /*outputShape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 1}, /*input_shape=*/{1, 4, 4, 1},
-      /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*outputShape_data=*/{1, 4, 4, 1},
+      /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 1}, /*input_shape=*/{1, 4, 4, 1},
+      /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
       /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9},
       /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
       /*bias_data=*/{},
       /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
-      /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
-      getElementType<float>());
+      /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
 
   SUCCEED();
 }
@@ -86,16 +82,15 @@ TEST(TransposeConvTest, FloatSimple)
 TEST(TransposeConvTest, FloatTwoFiltersTest)
 {
   Check<float, float>(
-      /*outputShape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 2}, /*input_shape=*/{1, 4, 4, 2},
-      /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*outputShape_data=*/{1, 4, 4, 1},
+      /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 2}, /*input_shape=*/{1, 4, 4, 2},
+      /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
       /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
       /*input_data=*/{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
                       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
       /*bias_data=*/{},
-      /*output_data=*/{184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968,
-                       3352, 3652, 2760},
-      /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
-      getElementType<float>());
+      /*output_data=*/
+      {184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760},
+      /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
 
   SUCCEED();
 }
@@ -103,28 +98,61 @@ TEST(TransposeConvTest, FloatTwoFiltersTest)
 TEST(TransposeConvTest, SimpleBiasTest)
 {
   Check<float, float>(
-      /*outputShape_shape=*/{4}, /*weight_shape=*/{2, 3, 3, 1},
+      /*output_shape_shape=*/{4}, /*weight_shape=*/{2, 3, 3, 1},
       /*input_shape=*/{1, 2, 2, 1},
-      /*bias_shape=*/{2}, /*output_shape=*/{1, 4, 4, 1}, /*outputShape_data=*/{1, 5, 5, 2},
+      /*bias_shape=*/{2}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 5, 5, 2},
       /*weight_data=*/{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18},
       /*input_data=*/{1, 2, 3, 4},
       /*bias_data=*/{3, 4},
       /*output_data=*/{4,  6,  6,  8,  10, 14, 9,  12, 13, 16, 10,  12,  12, 14, 28, 32, 21,
                        24, 25, 28, 19, 24, 27, 32, 65, 76, 45, 52,  57,  64, 24, 28, 30, 34,
                        64, 72, 39, 44, 47, 52, 42, 46, 48, 52, 106, 114, 63, 68, 71, 76},
-      /*params.padding=*/luci::Padding::VALID, /*stride_height=*/2, /*stride_width=*/2,
-      getElementType<float>());
+      /*params.padding=*/luci::Padding::VALID, /*stride_height=*/2, /*stride_width=*/2);
 
   SUCCEED();
 }
 
-// TODO Uint8Simple
-// Implement GetDequantizedOutput Function.
-// Create Test for Uint8 Case
+TEST(TransposeConvTest, UInt8)
+{
+  std::vector<float> input_data{1, 2, 3, 4};
+  std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
+  std::vector<float> bias_data{3, 4};
+  std::vector<int32_t> output_shape_data{1, 5, 5, 2};
+  std::vector<float> ref_output_data{
+      4,  6,  6,  8,  10,  14,  9,  12, 13, 16, //
+      10, 12, 12, 14, 28,  32,  21, 24, 25, 28, //
+      19, 24, 27, 32, 65,  76,  45, 52, 57, 64, //
+      24, 28, 30, 34, 64,  72,  39, 44, 47, 52, //
+      42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+  };
+
+  // Choose quantization parameters carefully.
+  auto input_quant = quantizationParams<uint8_t>(-8.0, 7.9375);  // s = 1 / 16, zp = 128
+  auto filter_quant = quantizationParams<uint8_t>(-24.0, 39.75); // s = 1 / 4, zp = 96
+  auto output_quant = quantizationParams<uint8_t>(-64.0, 191.0); // s = 1, zp = 64
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first,
+                                                      input_quant.second, input_data);
+  Tensor filter_tensor = makeInputTensor<DataType::U8>({2, 3, 3, 1}, filter_quant.first,
+                                                       filter_quant.second, filter_data);
+  Tensor bias_tensor =
+      makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first, 0, bias_data);
+  Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
+  Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
+
+  TransposeConvParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
+                       &output_tensor, params);
+  kernel.configure();
+  kernel.execute();
 
-// TODO Uint8FiltersTest
-// Implement GetDequantizedOutput Function.
-// Create Test for Uint8 Case
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
 
 } // namespace
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Utils.cpp b/compiler/luci-interpreter/src/kernels/Utils.cpp
index b9e7738a9..52e76a81c 100644
--- a/compiler/luci-interpreter/src/kernels/Utils.cpp
+++ b/compiler/luci-interpreter/src/kernels/Utils.cpp
@@ -89,20 +89,23 @@ static void calculateActivationRangeQuantizedImpl(Activation activation, int32_t
 void calculateActivationRangeQuantized(Activation activation, const Tensor *output,
                                        int32_t *activation_min, int32_t *activation_max)
 {
+  // For now, assume that signed type implies signed symmetric quantization.
   int32_t qmin{};
   int32_t qmax{};
   switch (output->element_type())
   {
     case DataType::U8:
-      qmin = std::numeric_limits<uint8_t>::min();
+      qmin = 0;
       qmax = std::numeric_limits<uint8_t>::max();
       break;
     case DataType::S8:
-      qmin = std::numeric_limits<int8_t>::min();
+      assert(output->zero_point() == 0);
+      qmin = -std::numeric_limits<int8_t>::max();
       qmax = std::numeric_limits<int8_t>::max();
       break;
     case DataType::S16:
-      qmin = std::numeric_limits<int16_t>::min();
+      assert(output->zero_point() == 0);
+      qmin = -std::numeric_limits<int16_t>::max();
       qmax = std::numeric_limits<int16_t>::max();
       break;
     default:
diff --git a/compiler/luci-interpreter/src/kernels/Utils.h b/compiler/luci-interpreter/src/kernels/Utils.h
index 7927151c6..67bb7581a 100644
--- a/compiler/luci-interpreter/src/kernels/Utils.h
+++ b/compiler/luci-interpreter/src/kernels/Utils.h
@@ -25,6 +25,7 @@
 
 #include <cassert>
 #include <cstdint>
+#include <stdexcept>
 
 namespace luci_interpreter
 {
@@ -70,6 +71,11 @@ inline int32_t computeOutputSize(Padding padding, int32_t image_size, int32_t fi
   }
 }
 
+inline int32_t calcOffset(const Shape &shape, int32_t d0, int32_t d1, int32_t d2, int32_t d3)
+{
+  return ((d0 * shape.dim(1) + d1) * shape.dim(2) + d2) * shape.dim(3) + d3;
+}
+
 void calculateActivationRange(Activation activation, float *activation_min, float *activation_max);
 
 void calculateActivationRangeQuantized(Activation activation, const Tensor *output,
@@ -94,6 +100,14 @@ void quantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quan
 
 Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_shape);
 
+inline double getQuantizedConvolutionMultipler(float input_scale, float filter_scale,
+                                               float output_scale)
+{
+  const double input_product_scale = static_cast<double>(input_scale * filter_scale);
+  LUCI_INTERPRETER_CHECK(input_product_scale >= 0);
+  return input_product_scale / static_cast<double>(output_scale);
+}
+
 inline tflite::RuntimeShape getTensorShape(const Tensor *tensor)
 {
   if (tensor == nullptr)
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
index 126a1cb5b..66aa38ff0 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -23,19 +23,37 @@
 #include "kernels/Conv2D.h"
 #include "kernels/DepthToSpace.h"
 #include "kernels/DepthwiseConv2D.h"
+#include "kernels/Div.h"
 #include "kernels/Elu.h"
+#include "kernels/Floor.h"
+#include "kernels/FloorDiv.h"
+#include "kernels/Equal.h"
 #include "kernels/FullyConnected.h"
+#include "kernels/Greater.h"
+#include "kernels/GreaterEqual.h"
 #include "kernels/If.h"
 #include "kernels/L2Normalize.h"
 #include "kernels/L2Pool2D.h"
 #include "kernels/LeakyRelu.h"
+#include "kernels/Less.h"
+#include "kernels/LessEqual.h"
 #include "kernels/LocalResponseNormalization.h"
 #include "kernels/Logistic.h"
+#include "kernels/LogSoftmax.h"
+#include "kernels/Maximum.h"
 #include "kernels/MaxPool2D.h"
 #include "kernels/Mean.h"
+#include "kernels/Minimum.h"
 #include "kernels/Mul.h"
+#include "kernels/NotEqual.h"
 #include "kernels/Pad.h"
+#include "kernels/Pow.h"
+#include "kernels/Prelu.h"
+#include "kernels/Relu.h"
+#include "kernels/Relu6.h"
 #include "kernels/Reshape.h"
+#include "kernels/ResizeBilinear.h"
+#include "kernels/ResizeNearestNeighbor.h"
 #include "kernels/Reverse.h"
 #include "kernels/Rsqrt.h"
 #include "kernels/Slice.h"
@@ -44,6 +62,7 @@
 #include "kernels/Split.h"
 #include "kernels/StridedSlice.h"
 #include "kernels/Sqrt.h"
+#include "kernels/Sub.h"
 #include "kernels/Squeeze.h"
 #include "kernels/Tanh.h"
 #include "kernels/Unpack.h"
@@ -229,6 +248,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthwiseConv2D *
   return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, params);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDiv *node)
+{
+  assert(node->arity() == 2);
+  const Tensor *input1 = getInputTensor(node->x());
+  const Tensor *input2 = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  DivParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Div>(input1, input2, output, params);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleElu *node)
 {
   assert(node->arity() == 1);
@@ -239,6 +271,38 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleElu *node)
   return std::make_unique<kernels::Elu>(input, output);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFloor *node)
+{
+  assert(node->arity() == 1);
+
+  const Tensor *input = getInputTensor(node->x());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Floor>(input, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFloorDiv *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::FloorDiv>(x, y, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleEqual *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Equal>(x, y, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFullyConnected *node)
 {
   assert(node->arity() == 3);
@@ -254,6 +318,28 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFullyConnected *n
   return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleGreater *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Greater>(x, y, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleGreaterEqual *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::GreaterEqual>(x, y, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
 {
   auto output_nodes = collectOutputNodes<luci::CircleIfOut>(node);
@@ -323,6 +409,28 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLeakyRelu *node)
   return std::make_unique<kernels::LeakyRelu>(input, output, params);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLess *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Less>(x, y, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLessEqual *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::LessEqual>(x, y, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLocalResponseNormalization *node)
 {
   assert(node->arity() == 1);
@@ -348,6 +456,27 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLogistic *node)
   return std::make_unique<kernels::Logistic>(input, output);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLogSoftmax *node)
+{
+  assert(node->arity() == 1);
+
+  const Tensor *input = getInputTensor(node->logits());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::LogSoftmax>(input, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMaximum *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = getInputTensor(node->x());
+  const Tensor *input2 = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Maximum>(input1, input2, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMaxPool2D *node)
 {
   assert(node->arity() == 1);
@@ -380,6 +509,17 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMean *node)
   return std::make_unique<kernels::Mean>(input, axes, output, params);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMinimum *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = getInputTensor(node->x());
+  const Tensor *input2 = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Minimum>(input1, input2, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMul *node)
 {
   assert(node->arity() == 2);
@@ -394,6 +534,17 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMul *node)
   return std::make_unique<kernels::Mul>(input1, input2, output, params);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleNotEqual *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *x = getInputTensor(node->x());
+  const Tensor *y = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::NotEqual>(x, y, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleOutput *)
 {
   throw std::runtime_error("Output node cannot be executed.");
@@ -410,6 +561,49 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CirclePad *node)
   return std::make_unique<kernels::Pad>(input, paddings, output);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CirclePow *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = getInputTensor(node->x());
+  const Tensor *input2 = getInputTensor(node->y());
+
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Pow>(input1, input2, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CirclePRelu *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input = getInputTensor(node->input());
+  const Tensor *alpha = getInputTensor(node->alpha());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Prelu>(input, alpha, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleRelu *node)
+{
+  assert(node->arity() == 1);
+
+  const Tensor *input = getInputTensor(node->features());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Relu>(input, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleRelu6 *node)
+{
+  assert(node->arity() == 1);
+
+  const Tensor *input = getInputTensor(node->features());
+  Tensor *output = getOutputTensor(node);
+
+  return std::make_unique<kernels::Relu6>(input, output);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReshape *node)
 {
   assert(node->arity() == 2);
@@ -422,6 +616,40 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReshape *node)
   return std::make_unique<kernels::Reshape>(input, shape, output);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleResizeBilinear *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input = getInputTensor(node->input());
+  const Tensor *size = getInputTensor(node->size());
+  Tensor *output = getOutputTensor(node);
+
+  ResizeBilinearParams params{};
+  params.align_corners = node->align_corners();
+  params.half_pixel_centers = node->half_pixel_centers();
+
+  return std::make_unique<kernels::ResizeBilinear>(input, size, output, params);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleResizeNearestNeighbor *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input = getInputTensor(node->input());
+  const Tensor *size = getInputTensor(node->size());
+  Tensor *output = getOutputTensor(node);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = node->align_corners();
+  // TODO update half_pixel_centers after CircleResizeNearestNeighbor updated
+  // Current CircleResizeNearestNeighbor don't have half_pixel_centers.
+  // default value on current is false.
+  // it need to be updated when CircleResizeNearestNeighbor updated.
+  params.half_pixel_centers = false;
+
+  return std::make_unique<kernels::ResizeNearestNeighbor>(input, size, output, params);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReverseV2 *node)
 {
   assert(node->arity() == 2);
@@ -443,6 +671,20 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleRsqrt *node)
   return std::make_unique<kernels::Rsqrt>(input, output);
 }
 
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSub *node)
+{
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = getInputTensor(node->x());
+  const Tensor *input2 = getInputTensor(node->y());
+  Tensor *output = getOutputTensor(node);
+
+  SubParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Sub>(input1, input2, output, params);
+}
+
 std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSlice *node)
 {
   assert(node->arity() == 3);
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
index 31cb9d8fc..663104700 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
@@ -47,23 +47,42 @@ public:
   std::unique_ptr<Kernel> visit(const luci::CircleConst *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleDepthToSpace *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleDepthwiseConv2D *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleDiv *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleElu *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleFloor *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleFloorDiv *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleEqual *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleFullyConnected *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleGreater *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleGreaterEqual *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleIf *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleL2Normalize *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleL2Pool2D *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleLeakyRelu *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleLess *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleLessEqual *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleLocalResponseNormalization *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleLogistic *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleLogSoftmax *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleInput *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleMaximum *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleMaxPool2D *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleMean *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleMinimum *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleMul *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleNotEqual *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleOutput *node) override;
   std::unique_ptr<Kernel> visit(const luci::CirclePad *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CirclePow *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CirclePRelu *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleRelu *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleRelu6 *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleReshape *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleResizeBilinear *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleResizeNearestNeighbor *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleReverseV2 *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleRsqrt *node) override;
+  std::unique_ptr<Kernel> visit(const luci::CircleSub *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleSlice *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleSoftmax *node) override;
   std::unique_ptr<Kernel> visit(const luci::CircleSpaceToDepth *node) override;
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
index 4e2bc3d0b..ea055542d 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -24,18 +24,36 @@
 #include <kernels/Conv2D.h>
 #include <kernels/DepthToSpace.h>
 #include <kernels/DepthwiseConv2D.h>
+#include <kernels/Div.h>
 #include <kernels/Elu.h>
+#include <kernels/Floor.h>
+#include <kernels/FloorDiv.h>
+#include <kernels/Equal.h>
 #include <kernels/FullyConnected.h>
+#include <kernels/Greater.h>
+#include <kernels/GreaterEqual.h>
 #include <kernels/L2Normalize.h>
 #include <kernels/L2Pool2D.h>
 #include <kernels/LeakyRelu.h>
+#include <kernels/Less.h>
+#include <kernels/LessEqual.h>
 #include <kernels/LocalResponseNormalization.h>
 #include <kernels/Logistic.h>
+#include <kernels/LogSoftmax.h>
+#include <kernels/Maximum.h>
 #include <kernels/MaxPool2D.h>
 #include <kernels/Mean.h>
+#include <kernels/Minimum.h>
 #include <kernels/Mul.h>
+#include <kernels/NotEqual.h>
 #include <kernels/Pad.h>
+#include <kernels/Pow.h>
+#include <kernels/Prelu.h>
+#include <kernels/Relu.h>
+#include <kernels/Relu6.h>
 #include <kernels/Reshape.h>
+#include <kernels/ResizeBilinear.h>
+#include <kernels/ResizeNearestNeighbor.h>
 #include <kernels/Reverse.h>
 #include <kernels/Rsqrt.h>
 #include <kernels/Slice.h>
@@ -43,6 +61,7 @@
 #include <kernels/SpaceToDepth.h>
 #include <kernels/Split.h>
 #include <kernels/Sqrt.h>
+#include <kernels/Sub.h>
 #include <kernels/Squeeze.h>
 #include <kernels/StridedSlice.h>
 #include <kernels/Tanh.h>
@@ -279,6 +298,26 @@ TEST_F(KernelBuilderTest, DepthwiseConv2D)
   EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
 }
 
+TEST_F(KernelBuilderTest, Div)
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleDiv>();
+  op->x(input1);
+  op->y(input2);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::Div>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
 TEST_F(KernelBuilderTest, Elu)
 {
   auto *input = createInputNode();
@@ -293,6 +332,54 @@ TEST_F(KernelBuilderTest, Elu)
   checkTensor(kernel->output(), op);
 }
 
+TEST_F(KernelBuilderTest, Floor)
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleFloor>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Floor>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, FloorDiv)
+{
+  auto *x = createInputNode();
+  auto *y = createInputNode();
+
+  auto *op = createNode<luci::CircleFloorDiv>();
+  op->x(x);
+  op->y(y);
+
+  auto kernel = buildKernel<kernels::FloorDiv>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x);
+  checkTensor(kernel->y(), y);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Equal)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Equal>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, FullyConnected)
 {
   auto *input = createInputNode();
@@ -316,6 +403,40 @@ TEST_F(KernelBuilderTest, FullyConnected)
   EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
 }
 
+TEST_F(KernelBuilderTest, Greater)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleGreater>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Greater>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, GreaterEqual)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleGreaterEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::GreaterEqual>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, L2Normalize)
 {
   auto *input = createInputNode();
@@ -377,6 +498,40 @@ TEST_F(KernelBuilderTest, LeakyRelu)
   EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
 }
 
+TEST_F(KernelBuilderTest, Less)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleLess>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Less>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LessEqual)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleLessEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::LessEqual>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, LocalResponseNormalization)
 {
   auto *input = createInputNode();
@@ -414,6 +569,37 @@ TEST_F(KernelBuilderTest, Logistic)
   checkTensor(kernel->output(), op);
 }
 
+TEST_F(KernelBuilderTest, LogSoftmax)
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLogSoftmax>();
+  op->logits(input);
+
+  auto kernel = buildKernel<kernels::LogSoftmax>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Maximum)
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleMaximum>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Maximum>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, MaxPool2D)
 {
   auto *input = createInputNode();
@@ -461,6 +647,23 @@ TEST_F(KernelBuilderTest, Mean)
   EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims()));
 }
 
+TEST_F(KernelBuilderTest, Minimum)
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleMinimum>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Minimum>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, Mul)
 {
   auto *input1 = createInputNode();
@@ -481,6 +684,23 @@ TEST_F(KernelBuilderTest, Mul)
   EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
 }
 
+TEST_F(KernelBuilderTest, NotEqual)
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleNotEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::NotEqual>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, Pad)
 {
   auto *input = createInputNode();
@@ -498,6 +718,68 @@ TEST_F(KernelBuilderTest, Pad)
   checkTensor(kernel->output(), op);
 }
 
+TEST_F(KernelBuilderTest, Pow)
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CirclePow>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Pow>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Prelu)
+{
+  auto *input = createInputNode();
+  auto *alpha = createInputNode();
+
+  auto *op = createNode<luci::CirclePRelu>();
+  op->input(input);
+  op->alpha(alpha);
+
+  auto kernel = buildKernel<kernels::Prelu>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->alpha(), alpha);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Relu)
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleRelu>();
+  op->features(input);
+
+  auto kernel = buildKernel<kernels::Relu>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Relu6)
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleRelu6>();
+  op->features(input);
+
+  auto kernel = buildKernel<kernels::Relu6>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
 TEST_F(KernelBuilderTest, Reshape)
 {
   auto *input = createInputNode();
@@ -515,6 +797,48 @@ TEST_F(KernelBuilderTest, Reshape)
   checkTensor(kernel->output(), op);
 }
 
+TEST_F(KernelBuilderTest, ResizeBilinear)
+{
+  auto *input = createInputNode();
+  auto *size = createInputNode();
+
+  auto *op = createNode<luci::CircleResizeBilinear>();
+  op->input(input);
+  op->size(size);
+  op->align_corners(true);
+  op->half_pixel_centers(true);
+
+  auto kernel = buildKernel<kernels::ResizeBilinear>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size(), size);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().align_corners, Eq(op->align_corners()));
+  EXPECT_THAT(kernel->params().half_pixel_centers, Eq(op->half_pixel_centers()));
+}
+
+TEST_F(KernelBuilderTest, ResizeNearestNeighbor)
+{
+  auto *input = createInputNode();
+  auto *size = createInputNode();
+
+  auto *op = createNode<luci::CircleResizeNearestNeighbor>();
+  op->input(input);
+  op->size(size);
+  op->align_corners(true);
+
+  auto kernel = buildKernel<kernels::ResizeNearestNeighbor>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size(), size);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().align_corners, Eq(op->align_corners()));
+  // TODO currently half_pixel_centers are not implemented on CircleResizeNearestNeighbor
+  // after adding, need to be updated.
+}
+
 TEST_F(KernelBuilderTest, ReverseV2)
 {
   auto *input = createInputNode();
@@ -636,6 +960,26 @@ TEST_F(KernelBuilderTest, Sqrt)
   checkTensor(kernel->output(), op);
 }
 
+TEST_F(KernelBuilderTest, Sub)
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleSub>();
+  op->x(input1);
+  op->y(input2);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::Sub>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
 TEST_F(KernelBuilderTest, Squeeze)
 {
   auto *input = createInputNode();
diff --git a/compiler/luci/export/src/CircleExporterUtils.cpp b/compiler/luci/export/src/CircleExporterUtils.cpp
index f097e71c5..1fdb40e51 100644
--- a/compiler/luci/export/src/CircleExporterUtils.cpp
+++ b/compiler/luci/export/src/CircleExporterUtils.cpp
@@ -36,6 +36,10 @@ circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func)
       return circle::ActivationFunctionType_RELU_N1_TO_1;
     case luci::FusedActFunc::RELU6:
       return circle::ActivationFunctionType_RELU6;
+    case luci::FusedActFunc::TANH:
+      return circle::ActivationFunctionType_TANH;
+    case luci::FusedActFunc::SIGN_BIT:
+      return circle::ActivationFunctionType_SIGN_BIT;
     default:
       INTERNAL_EXN_V("trying to convert unsupported luci::FusedActFunc", oops::to_uint32(func));
   }
@@ -83,6 +87,63 @@ circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode)
   }
 }
 
+circle::DimensionType to_circle_dimensiontype(luci::DimensionType type)
+{
+  switch (type)
+  {
+    case luci::DimensionType::DENSE:
+      return circle::DimensionType_DENSE;
+    case luci::DimensionType::SPARSE_CSR:
+      return circle::DimensionType_SPARSE_CSR;
+    default:
+      INTERNAL_EXN_V("trying to convert unsupported luci::DimensionType", oops::to_uint32(type));
+  }
+}
+
+flatbuffers::Offset<void> to_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
+                                                        const SparseIndexVector &sparse_idx_vec)
+{
+  auto type = sparse_idx_vec.type();
+  switch (type)
+  {
+    case luci::SparseIndexVectorType::NONE:
+      return flatbuffers::Offset<void>();
+    case luci::SparseIndexVectorType::I32:
+    {
+      return circle::CreateInt32VectorDirect(fb, sparse_idx_vec.as_int32_vector()).Union();
+    }
+    case luci::SparseIndexVectorType::U16:
+    {
+      return circle::CreateUint16VectorDirect(fb, sparse_idx_vec.as_uint16_vector()).Union();
+    }
+    case luci::SparseIndexVectorType::U8:
+    {
+      return circle::CreateUint8VectorDirect(fb, sparse_idx_vec.as_uint8_vector()).Union();
+    }
+    default:
+      INTERNAL_EXN_V("trying to convert unsupported luci::SparseIndexVectorType",
+                     oops::to_uint32(type));
+  }
+}
+
+circle::SparseIndexVector to_circle_sparse_index_vector_type(luci::SparseIndexVectorType type)
+{
+  switch (type)
+  {
+    case luci::SparseIndexVectorType::NONE:
+      return circle::SparseIndexVector_NONE;
+    case luci::SparseIndexVectorType::I32:
+      return circle::SparseIndexVector_Int32Vector;
+    case luci::SparseIndexVectorType::U16:
+      return circle::SparseIndexVector_Uint16Vector;
+    case luci::SparseIndexVectorType::U8:
+      return circle::SparseIndexVector_Uint8Vector;
+    default:
+      INTERNAL_EXN_V("trying to convert unsupported luci::SparseIndexVectorType",
+                     oops::to_uint32(type));
+  }
+}
+
 } // namespace luci
 
 namespace luci
diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h
index f9ce6d2bf..7857213b2 100644
--- a/compiler/luci/export/src/CircleExporterUtils.h
+++ b/compiler/luci/export/src/CircleExporterUtils.h
@@ -32,6 +32,10 @@ namespace luci
 circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func);
 circle::TensorType to_circle_tensortype(loco::DataType type);
 circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode);
+circle::DimensionType to_circle_dimensiontype(luci::DimensionType type);
+flatbuffers::Offset<void> to_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
+                                                        const SparseIndexVector &sparse_idx_vec);
+circle::SparseIndexVector to_circle_sparse_index_vector_type(luci::SparseIndexVectorType type);
 
 } // namespace luci
 
diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
index 36d61f6c9..c937109cd 100644
--- a/compiler/luci/export/src/CircleOperationExporter.cpp
+++ b/compiler/luci/export/src/CircleOperationExporter.cpp
@@ -632,6 +632,7 @@ public:
   void visit(luci::CircleCustom *) final;
   void visit(luci::CircleDepthToSpace *) final;
   void visit(luci::CircleDepthwiseConv2D *) final;
+  void visit(luci::CircleDequantize *) final;
   void visit(luci::CircleDiv *) final;
   void visit(luci::CircleElu *) final;
   void visit(luci::CircleEqual *) final;
@@ -718,6 +719,7 @@ public:
   void visit(luci::CircleTopKV2 *) final;
   void visit(luci::CircleTranspose *) final;
   void visit(luci::CircleTransposeConv *) final;
+  void visit(luci::CircleUnidirectionalSequenceLSTM *) final;
   void visit(luci::CircleUnique *) final;
   void visit(luci::CircleUnpack *) final;
   void visit(luci::CircleWhere *) final;
@@ -866,6 +868,11 @@ void OperationExporter::visit(luci::CircleDepthwiseConv2D *node)
                     .Union());
 }
 
+void OperationExporter::visit(luci::CircleDequantize *node)
+{
+  export_simple(node, circle::BuiltinOperator_DEQUANTIZE);
+}
+
 void OperationExporter::visit(luci::CircleDiv *node)
 {
   export_simple(
@@ -1371,6 +1378,17 @@ void OperationExporter::visit(luci::CircleTransposeConv *node)
                     .Union());
 }
 
+void OperationExporter::visit(luci::CircleUnidirectionalSequenceLSTM *node)
+{
+  export_simple(node, circle::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+                circle::BuiltinOptions_UnidirectionalSequenceLSTMOptions,
+                CreateUnidirectionalSequenceLSTMOptions(
+                    _ctx.builder, to_circle_actfunc(node->fusedActivationFunction()),
+                    node->cell_clip(), node->proj_clip(), node->time_major(),
+                    node->asymmetric_quantize_inputs())
+                    .Union());
+}
+
 void OperationExporter::visit(luci::CircleUnique *node) { export_node(_ctx, node); }
 
 void OperationExporter::visit(luci::CircleUnpack *node) { export_node(_ctx, node); }
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index dc8c2fbc9..86e324698 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -63,6 +63,9 @@ public:
   luci::CircleQuantParam *quantparam(void) const { return _quantparam; }
   void quantparam(luci::CircleQuantParam *qp) { _quantparam = qp; }
 
+  luci::SparsityParam *sparsityparam(void) const { return _sparsityparam; }
+  void sparsityparam(luci::SparsityParam *sp) { _sparsityparam = sp; }
+
 private:
   std::string _name;
 
@@ -72,6 +75,7 @@ private:
 
   luci::CircleConst *_content = nullptr;
   luci::CircleQuantParam *_quantparam = nullptr;
+  luci::SparsityParam *_sparsityparam = nullptr;
 };
 
 using CircleTensorContext = std::vector<CircleTensoInfo>;
@@ -109,6 +113,7 @@ void allocateCircleTensorInfo(CircleNode *node, CircleTensorContext &ctx)
 
   tensor_info.content(dynamic_cast<luci::CircleConst *>(node));
   tensor_info.quantparam(node->quantparam());
+  tensor_info.sparsityparam(node->sparsityparam());
 
   set_tensor_index(node, tensor_index);
 
@@ -265,6 +270,8 @@ flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder, l
   {
     case loco::DataType::FLOAT32:
       return encodeOpBufferByDType<loco::DataType::FLOAT32>(builder, c);
+    case loco::DataType::S8:
+      return encodeOpBufferByDType<loco::DataType::S8>(builder, c);
     case loco::DataType::S16:
       return encodeOpBufferByDType<loco::DataType::S16>(builder, c);
     case loco::DataType::S32:
@@ -308,6 +315,34 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam
                                               0, quantparam->quantized_dimension);
 }
 
+flatbuffers::Offset<circle::SparsityParameters>
+encodeSparsityParameters(FlatBufferBuilder &builder, luci::SparsityParam *sparsityparam)
+{
+  if (sparsityparam == nullptr)
+    return 0;
+
+  std::vector<flatbuffers::Offset<circle::DimensionMetadata>> dim_metadata_vec;
+  auto luci_dim_metadata = sparsityparam->dim_metadata;
+  for (auto it : luci_dim_metadata)
+  {
+    // array_segments
+    auto circle_array_segments = to_circle_sparse_index_vector(builder, it.array_segments());
+    auto circle_array_segments_type =
+        to_circle_sparse_index_vector_type(it.array_segments().type());
+
+    // array_indices
+    auto circle_array_indices = to_circle_sparse_index_vector(builder, it.array_indices());
+    auto circle_array_indices_type = to_circle_sparse_index_vector_type(it.array_indices().type());
+    auto dim_metadata = circle::CreateDimensionMetadata(
+        builder, to_circle_dimensiontype(it.format()), it.dense_size(), circle_array_segments_type,
+        circle_array_segments, circle_array_indices_type, circle_array_indices);
+    dim_metadata_vec.emplace_back(dim_metadata);
+  }
+
+  return circle::CreateSparsityParametersDirect(builder, &sparsityparam->traversal_order,
+                                                &sparsityparam->block_map, &dim_metadata_vec);
+}
+
 void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder,
                            SerializedModelData &md, SerializedGraphData &gd)
 {
@@ -322,12 +357,14 @@ void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &build
 
   auto quantparam = encodeQuantizationParameters(builder, info.quantparam());
 
+  auto sparsityparam = encodeSparsityParameters(builder, info.sparsityparam());
+
   auto buffer_id = static_cast<uint32_t>(md._buffers.size());
   md._buffers.push_back(buffer);
 
   auto name_offset = builder.CreateString(info.name());
   auto tensor_offset = CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset,
-                                    quantparam, /*is_variable*/ false);
+                                    quantparam, /*is_variable*/ false, sparsityparam);
   gd._tensors.push_back(tensor_offset);
 }
 
diff --git a/compiler/luci/import/include/luci/Import/CircleReader.h b/compiler/luci/import/include/luci/Import/CircleReader.h
index 3d85b9e35..388942490 100644
--- a/compiler/luci/import/include/luci/Import/CircleReader.h
+++ b/compiler/luci/import/include/luci/Import/CircleReader.h
@@ -23,6 +23,7 @@
 #include <luci/IR/AttrPadding.h>
 #include <luci/IR/CircleNode.h>
 #include <luci/IR/CircleQuantParam.h>
+#include <luci/IR/SparsityParam.h>
 
 #include <loco.h>
 
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index 0b21d380f..28741064e 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -36,6 +36,7 @@
 #include "Nodes/CircleCustom.h"
 #include "Nodes/CircleDepthToSpace.h"
 #include "Nodes/CircleDepthwiseConv2D.h"
+#include "Nodes/CircleDequantize.h"
 #include "Nodes/CircleDiv.h"
 #include "Nodes/CircleElu.h"
 #include "Nodes/CircleEqual.h"
@@ -123,6 +124,7 @@
 #include "Nodes/CircleTopKV2.h"
 #include "Nodes/CircleTranspose.h"
 #include "Nodes/CircleTransposeConv.h"
+#include "Nodes/CircleUnidirectionalSequenceLSTM.h"
 #include "Nodes/CircleUnique.h"
 #include "Nodes/CircleUnpack.h"
 #include "Nodes/CircleWhere.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleDequantize.h b/compiler/luci/import/include/luci/Import/Nodes/CircleDequantize.h
new file mode 100644
index 000000000..e25b80b0e
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleDequantize.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_DEQUANTIZE_H__
+#define __LUCI_IMPORT_OP_CIRCLE_DEQUANTIZE_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+
+class CircleDequantizeGraphBuilder : public GraphBuilder
+{
+public:
+  bool validate(const ValidateArgs &args) const final;
+
+private:
+  CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+                         loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_DEQUANTIZE_H__
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleUnidirectionalSequenceLSTM.h b/compiler/luci/import/include/luci/Import/Nodes/CircleUnidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..4cc3320dc
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleUnidirectionalSequenceLSTM.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_UNIDIRECTIONALSEQUENCELSTM_H__
+#define __LUCI_IMPORT_OP_CIRCLE_UNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+
+class CircleUnidirectionalSequenceLSTMGraphBuilder : public GraphBuilder
+{
+public:
+  bool validate(const ValidateArgs &args) const final;
+
+private:
+  CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+                         loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_UNIDIRECTIONALSEQUENCELSTM_H__
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index bc7f39762..9ddc37d0a 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -115,7 +115,9 @@ FusedActFunc luci_actfunc(const circle::ActivationFunctionType type)
     case circle::ActivationFunctionType::ActivationFunctionType_RELU6:
       return luci::FusedActFunc::RELU6;
     case circle::ActivationFunctionType::ActivationFunctionType_TANH:
-      break;
+      return luci::FusedActFunc::TANH;
+    case circle::ActivationFunctionType::ActivationFunctionType_SIGN_BIT:
+      return luci::FusedActFunc::SIGN_BIT;
     default:
       break;
   }
@@ -149,6 +151,49 @@ MirrorPadMode luci_mirrorpad_mode(const circle::MirrorPadMode mode)
   return MirrorPadMode::UNDEFINED;
 }
 
+DimensionType luci_dim_type(const circle::DimensionType dim_type)
+{
+  switch (dim_type)
+  {
+    case circle::DimensionType_DENSE:
+      return DimensionType::DENSE;
+    case circle::DimensionType_SPARSE_CSR:
+      return DimensionType::SPARSE_CSR;
+    default:
+      throw std::runtime_error("Invalid DimensionType");
+  }
+}
+
+SparseIndexVector
+luci_sparse_index_vector(const circle::SparseIndexVectorUnion &sparse_index_vector)
+{
+  switch (sparse_index_vector.type)
+  {
+    case circle::SparseIndexVector_NONE:
+      return SparseIndexVector{SparseIndexVectorType::NONE, nullptr};
+    case circle::SparseIndexVector_Int32Vector:
+    {
+      const auto const_vec_ptr =
+          static_cast<const void *>(&(sparse_index_vector.AsInt32Vector()->values));
+      return SparseIndexVector{SparseIndexVectorType::I32, const_vec_ptr};
+    }
+    case circle::SparseIndexVector_Uint16Vector:
+    {
+      const auto const_vec_ptr =
+          static_cast<const void *>(&(sparse_index_vector.AsUint16Vector()->values));
+      return SparseIndexVector{SparseIndexVectorType::U16, const_vec_ptr};
+    }
+    case circle::SparseIndexVector_Uint8Vector:
+    {
+      const auto const_vec_ptr =
+          static_cast<const void *>(&(sparse_index_vector.AsUint8Vector()->values));
+      return SparseIndexVector{SparseIndexVectorType::U8, const_vec_ptr};
+    }
+    default:
+      throw std::runtime_error("Invalid SparseIndexVector type");
+  }
+}
+
 std::unique_ptr<CircleQuantParam>
 luci_quantparam(const circle::QuantizationParametersT *quantization)
 {
@@ -174,6 +219,28 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
   return nullptr;
 }
 
+std::unique_ptr<SparsityParam> luci_sparsityparam(const circle::SparsityParametersT *sparsity)
+{
+  assert(sparsity);
+  const auto &traversal_order = sparsity->traversal_order;
+  const auto &block_map = sparsity->block_map;
+  const auto &dim_metadata = sparsity->dim_metadata;
+
+  // TODO find a condition that should return nullptr
+  auto sparsityparam = std::make_unique<SparsityParam>();
+
+  sparsityparam->traversal_order = traversal_order;
+  sparsityparam->block_map = block_map;
+  for (const auto &dm : dim_metadata)
+  {
+    sparsityparam->dim_metadata.emplace_back(luci_dim_type(dm->format), dm->dense_size,
+                                             luci_sparse_index_vector(dm->array_segments),
+                                             luci_sparse_index_vector(dm->array_indices));
+  }
+
+  return sparsityparam;
+}
+
 void copy_tensor_attributes(const circle::TensorT &tensor, CircleNode *node)
 {
   node->name(tensor_name(tensor));
@@ -193,6 +260,14 @@ void copy_tensor_attributes(const circle::TensorT &tensor, CircleNode *node)
     if (quantparam)
       node->quantparam(std::move(quantparam));
   }
+
+  const auto *sparsity = tensor.sparsity.get();
+  if (sparsity != nullptr)
+  {
+    auto sparsityparam = luci_sparsityparam(sparsity);
+    if (sparsityparam)
+      node->sparsityparam(std::move(sparsityparam));
+  }
 }
 
 circle::BuiltinOperator CircleReader::builtin_code(const circle::OperatorT &op) const
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index c6bcacb54..d598d30f4 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -45,6 +45,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   CIRCLE_NODE(COS, CircleCosGraphBuilder);                                                 // 108
   CIRCLE_NODE(DEPTH_TO_SPACE, CircleDepthToSpaceGraphBuilder);                             // 5
   CIRCLE_NODE(DEPTHWISE_CONV_2D, CircleDepthwiseConv2DGraphBuilder);                       // 4
+  CIRCLE_NODE(DEQUANTIZE, CircleDequantizeGraphBuilder);                                   // 6
   CIRCLE_NODE(DIV, CircleDivGraphBuilder);                                                 // 42
   CIRCLE_NODE(ELU, CircleEluGraphBuilder);                                                 // 111
   CIRCLE_NODE(EQUAL, CircleEqualGraphBuilder);                                             // 71
@@ -132,6 +133,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   CIRCLE_NODE(TOPK_V2, CircleTopKV2GraphBuilder);                                          // 48
   CIRCLE_NODE(TRANSPOSE, CircleTransposeGraphBuilder);                                     // 39
   CIRCLE_NODE(TRANSPOSE_CONV, CircleTransposeConvGraphBuilder);                            // 67
+  CIRCLE_NODE(UNIDIRECTIONAL_SEQUENCE_LSTM, CircleUnidirectionalSequenceLSTMGraphBuilder); // 44
   CIRCLE_NODE(UNIQUE, CircleUniqueGraphBuilder);                                           // 103
   CIRCLE_NODE(UNPACK, CircleUnpackGraphBuilder);                                           // 88
   CIRCLE_NODE(WHERE, CircleWhereGraphBuilder);                                             // 109
@@ -140,7 +142,6 @@ GraphBuilderRegistry::GraphBuilderRegistry()
 
 #undef CIRCLE_NODE
 
-  // BuiltinOperator_DEQUANTIZE = 6,
   // BuiltinOperator_EMBEDDING_LOOKUP = 7,
   // BuiltinOperator_HASHTABLE_LOOKUP = 10,
   // BuiltinOperator_LSH_PROJECTION = 15,
@@ -152,7 +153,6 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   // BuiltinOperator_CALL = 31,
   // BuiltinOperator_EMBEDDING_LOOKUP_SPARSE = 33,
   // BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN = 35,
-  // BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   // BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46,
   // BuiltinOperator_DELEGATE = 51,
   // BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM = 52,
diff --git a/compiler/luci/import/src/Nodes/CircleConst.cpp b/compiler/luci/import/src/Nodes/CircleConst.cpp
index fad7a0757..f69448dfe 100644
--- a/compiler/luci/import/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConst.cpp
@@ -51,6 +51,12 @@ static void copy_data(const std::vector<uint8_t> &raw_data, uint32_t num_element
 {
   using T = typename loco::DataTypeImpl<DT>::Type;
 
+  // TODO calculate the exact buffer size of sparse tensor
+  if (const_node->sparsityparam())
+  {
+    num_elements = raw_data.size() / sizeof(T);
+  }
+
   assert(raw_data.size() == num_elements * sizeof(T));
   const auto *data = reinterpret_cast<const T *>(raw_data.data());
 
@@ -61,9 +67,6 @@ static void copy_data(const std::vector<uint8_t> &raw_data, uint32_t num_element
   }
 }
 
-//
-// circleconst_from_tensor() ?
-//
 CircleConst *create_circleconst(GraphBuilderContext *context, int32_t tensor_index)
 {
   LOGGER(l);
@@ -77,7 +80,7 @@ CircleConst *create_circleconst(GraphBuilderContext *context, int32_t tensor_ind
   std::vector<int32_t> const_dims = const_tensor.shape; // in NHWC
   if (const_dims.size() == 0 && buffer.empty())
   {
-    // unknown shape tensor
+    // unknown shape tensor and scalar tensor
     return nullptr;
   }
 
diff --git a/compiler/luci/import/src/Nodes/CircleDequantize.cpp b/compiler/luci/import/src/Nodes/CircleDequantize.cpp
new file mode 100644
index 000000000..1936da97c
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleDequantize.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleDequantize.h"
+
+#include <luci/IR/Nodes/CircleDequantize.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleDequantizeGraphBuilder::validate(const ValidateArgs &args) const
+{
+  if (args.op.inputs.size() != 1)
+    return false;
+
+  return true;
+}
+
+CircleNode *CircleDequantizeGraphBuilder::build_node(const circle::OperatorT &,
+                                                     const std::vector<CircleNode *> &inputs,
+                                                     loco::Graph *graph) const
+{
+  auto *node = graph->nodes()->create<CircleDequantize>();
+  node->input(inputs.at(0));
+
+  // No options for Dequantize
+
+  return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp b/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..c41cf4def
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleUnidirectionalSequenceLSTM.h"
+
+#include <luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleUnidirectionalSequenceLSTMGraphBuilder::validate(const ValidateArgs &args) const
+{
+  if (args.op.inputs.size() != 24)
+    return false;
+
+  return true;
+}
+
+CircleNode *CircleUnidirectionalSequenceLSTMGraphBuilder::build_node(
+    const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
+{
+  auto *node = graph->nodes()->create<CircleUnidirectionalSequenceLSTM>();
+  node->input(inputs.at(0));
+  node->input_to_input_weights(inputs.at(1)); // Optional
+  node->input_to_cell_weights(inputs.at(2));
+  node->input_to_forget_weights(inputs.at(3));
+  node->input_to_output_weights(inputs.at(4));
+  node->recurrent_to_input_weights(inputs.at(5)); // Optional
+  node->recurrent_to_cell_weights(inputs.at(6));
+  node->recurrent_to_forget_weights(inputs.at(7));
+  node->recurrent_to_output_weights(inputs.at(8));
+  node->cell_to_input_weights(inputs.at(9));   // Optional
+  node->cell_to_forget_weights(inputs.at(10)); // Optional
+  node->cell_to_output_weights(inputs.at(11)); // Optional
+  node->input_gate_bias(inputs.at(12));        // Optional
+  node->forget_gate_bias(inputs.at(13));
+  node->cell_gate_bias(inputs.at(14));
+  node->output_gate_bias(inputs.at(15));
+  node->projection_weights(inputs.at(16)); // Optional
+  node->projection_bias(inputs.at(17));    // Optional
+  node->activation_state(inputs.at(18));
+  node->cell_state(inputs.at(19));
+  node->input_layer_norm_coefficients(inputs.at(20));  // Optional
+  node->forget_layer_norm_coefficients(inputs.at(21)); // Optional
+  node->cell_layer_norm_coefficients(inputs.at(22));   // Optional
+  node->output_layer_norm_coefficients(inputs.at(23)); // Optional
+  const std::vector<int32_t> optionals = {1, 5, 9, 10, 11, 12, 16, 17, 20, 21, 22, 23};
+  for (auto optional : optionals)
+  {
+    if (auto inp = dynamic_cast<luci::CircleOutputExclude *>(node->arg(optional)))
+    {
+      // CircleOutputExclude doesn't need a type, but since all nodes must have a type, a dummy type
+      // is inserted.
+      inp->dtype(loco::DataType::FLOAT32);
+    }
+  }
+
+  const auto *options = op.builtin_options.AsUnidirectionalSequenceLSTMOptions();
+  node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
+  node->cell_clip(options->cell_clip);
+  node->proj_clip(options->proj_clip);
+  node->time_major(options->time_major);
+  node->asymmetric_quantize_inputs(options->asymmetric_quantize_inputs);
+
+  return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/AttrFusedActFunc.h b/compiler/luci/lang/include/luci/IR/AttrFusedActFunc.h
index 2abae604b..3f21d5858 100644
--- a/compiler/luci/lang/include/luci/IR/AttrFusedActFunc.h
+++ b/compiler/luci/lang/include/luci/IR/AttrFusedActFunc.h
@@ -28,7 +28,9 @@ enum class FusedActFunc
   NONE,
   RELU,
   RELU_N1_TO_1,
-  RELU6
+  RELU6,
+  TANH,
+  SIGN_BIT
 };
 
 } // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h b/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
index 967103e3c..edec9d18b 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
@@ -25,6 +25,7 @@
 #include "CircleOpcode.h"
 #include "CircleNodeVisitor.forward.h"
 #include "CircleQuantParam.h"
+#include "SparsityParam.h"
 
 #include <memory>
 
@@ -54,6 +55,12 @@ struct CircleNode : public loco::Node,
     _quantparam = std::move(quantparam);
   }
 
+  SparsityParam *sparsityparam(void) const { return _sparsityparam.get(); }
+  void sparsityparam(std::unique_ptr<SparsityParam> &&sparsityparam)
+  {
+    _sparsityparam = std::move(sparsityparam);
+  }
+
   ShapeStatus shape_status(void) const { return _shape_status; }
   void shape_status(ShapeStatus ss) { _shape_status = ss; }
 
@@ -63,6 +70,7 @@ struct CircleNode : public loco::Node,
 private:
   NodeName _name;
   std::unique_ptr<CircleQuantParam> _quantparam;
+  std::unique_ptr<SparsityParam> _sparsityparam;
   ShapeStatus _shape_status{ShapeStatus::UNDEFINED};
   int32_t _op_version = 1;
 };
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.h b/compiler/luci/lang/include/luci/IR/CircleNodes.h
index 25b86d2e9..fde0b612b 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.h
@@ -34,6 +34,7 @@
 #include "Nodes/CircleCustom.h"
 #include "Nodes/CircleDepthToSpace.h"
 #include "Nodes/CircleDepthwiseConv2D.h"
+#include "Nodes/CircleDequantize.h"
 #include "Nodes/CircleDiv.h"
 #include "Nodes/CircleElu.h"
 #include "Nodes/CircleEqual.h"
@@ -120,6 +121,7 @@
 #include "Nodes/CircleTopKV2.h"
 #include "Nodes/CircleTranspose.h"
 #include "Nodes/CircleTransposeConv.h"
+#include "Nodes/CircleUnidirectionalSequenceLSTM.h"
 #include "Nodes/CircleUnique.h"
 #include "Nodes/CircleUnpack.h"
 #include "Nodes/CircleWhere.h"
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
index 9f0a1b16e..b9d545893 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
@@ -27,6 +27,7 @@ CIRCLE_NODE(COS, luci::CircleCos)
 CIRCLE_NODE(CUSTOM, luci::CircleCustom)
 CIRCLE_NODE(DEPTH_TO_SPACE, luci::CircleDepthToSpace)
 CIRCLE_NODE(DEPTHWISE_CONV_2D, luci::CircleDepthwiseConv2D)
+CIRCLE_NODE(DEQUANTIZE, luci::CircleDequantize)
 CIRCLE_NODE(DIV, luci::CircleDiv)
 CIRCLE_NODE(ELU, luci::CircleElu)
 CIRCLE_NODE(EQUAL, luci::CircleEqual)
@@ -113,6 +114,7 @@ CIRCLE_NODE(TILE, luci::CircleTile)
 CIRCLE_NODE(TOPK_V2, luci::CircleTopKV2)
 CIRCLE_NODE(TRANSPOSE, luci::CircleTranspose)
 CIRCLE_NODE(TRANSPOSE_CONV, luci::CircleTransposeConv)
+CIRCLE_NODE(UNIDIRECTIONAL_SEQUENCE_LSTM, luci::CircleUnidirectionalSequenceLSTM)
 CIRCLE_NODE(UNIQUE, luci::CircleUnique)
 CIRCLE_NODE(UNPACK, luci::CircleUnpack)
 CIRCLE_NODE(WHERE, luci::CircleWhere)
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h
new file mode 100644
index 000000000..847c5dfc5
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCELEDEQUANTIZE_H__
+#define __LUCI_IR_CIRCELEDEQUANTIZE_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief DEQUANTIZE in Circle
+ */
+class CircleDequantize final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::DEQUANTIZE>>
+{
+public:
+  loco::Node *input(void) const { return at(0)->node(); }
+  void input(loco::Node *node) { at(0)->node(node); }
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCELEDEQUANTIZE_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..4352b045b
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLEUNIDIRECTIONALSEQUENCELSTM_H__
+#define __LUCI_IR_CIRCLEUNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/AttrFusedActFunc.h"
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief UNIDIRECTIONAL_SEQUENCE_LSTM in Circle
+ */
+class CircleUnidirectionalSequenceLSTM final
+    : public FixedArityNode<24, CircleNodeImpl<CircleOpcode::UNIDIRECTIONAL_SEQUENCE_LSTM>>,
+      public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+{
+public:
+  loco::Node *input(void) const { return at(0)->node(); }
+  void input(loco::Node *node) { at(0)->node(node); }
+
+  loco::Node *input_to_input_weights(void) const { return at(1)->node(); }
+  void input_to_input_weights(loco::Node *node) { at(1)->node(node); }
+  loco::Node *input_to_forget_weights(void) const { return at(2)->node(); }
+  void input_to_forget_weights(loco::Node *node) { at(2)->node(node); }
+  loco::Node *input_to_cell_weights(void) const { return at(3)->node(); }
+  void input_to_cell_weights(loco::Node *node) { at(3)->node(node); }
+  loco::Node *input_to_output_weights(void) const { return at(4)->node(); }
+  void input_to_output_weights(loco::Node *node) { at(4)->node(node); }
+
+  loco::Node *recurrent_to_input_weights(void) const { return at(5)->node(); }
+  void recurrent_to_input_weights(loco::Node *node) { at(5)->node(node); }
+  loco::Node *recurrent_to_forget_weights(void) const { return at(6)->node(); }
+  void recurrent_to_forget_weights(loco::Node *node) { at(6)->node(node); }
+  loco::Node *recurrent_to_cell_weights(void) const { return at(7)->node(); }
+  void recurrent_to_cell_weights(loco::Node *node) { at(7)->node(node); }
+  loco::Node *recurrent_to_output_weights(void) const { return at(8)->node(); }
+  void recurrent_to_output_weights(loco::Node *node) { at(8)->node(node); }
+
+  loco::Node *cell_to_input_weights(void) const { return at(9)->node(); }
+  void cell_to_input_weights(loco::Node *node) { at(9)->node(node); }
+  loco::Node *cell_to_forget_weights(void) const { return at(10)->node(); }
+  void cell_to_forget_weights(loco::Node *node) { at(10)->node(node); }
+  loco::Node *cell_to_output_weights(void) const { return at(11)->node(); }
+  void cell_to_output_weights(loco::Node *node) { at(11)->node(node); }
+
+  loco::Node *input_gate_bias(void) const { return at(12)->node(); }
+  void input_gate_bias(loco::Node *node) { at(12)->node(node); }
+  loco::Node *forget_gate_bias(void) const { return at(13)->node(); }
+  void forget_gate_bias(loco::Node *node) { at(13)->node(node); }
+  loco::Node *cell_gate_bias(void) const { return at(14)->node(); }
+  void cell_gate_bias(loco::Node *node) { at(14)->node(node); }
+  loco::Node *output_gate_bias(void) const { return at(15)->node(); }
+  void output_gate_bias(loco::Node *node) { at(15)->node(node); }
+
+  loco::Node *projection_weights(void) const { return at(16)->node(); }
+  void projection_weights(loco::Node *node) { at(16)->node(node); }
+  loco::Node *projection_bias(void) const { return at(17)->node(); }
+  void projection_bias(loco::Node *node) { at(17)->node(node); }
+
+  loco::Node *activation_state(void) const { return at(18)->node(); }
+  void activation_state(loco::Node *node) { at(18)->node(node); }
+  loco::Node *cell_state(void) const { return at(19)->node(); }
+  void cell_state(loco::Node *node) { at(19)->node(node); }
+
+  loco::Node *input_layer_norm_coefficients(void) const { return at(20)->node(); }
+  void input_layer_norm_coefficients(loco::Node *node) { at(20)->node(node); }
+  loco::Node *forget_layer_norm_coefficients(void) const { return at(21)->node(); }
+  void forget_layer_norm_coefficients(loco::Node *node) { at(21)->node(node); }
+  loco::Node *cell_layer_norm_coefficients(void) const { return at(22)->node(); }
+  void cell_layer_norm_coefficients(loco::Node *node) { at(22)->node(node); }
+  loco::Node *output_layer_norm_coefficients(void) const { return at(23)->node(); }
+  void output_layer_norm_coefficients(loco::Node *node) { at(23)->node(node); }
+
+public:
+  float cell_clip(void) const { return _cell_clip; }
+  void cell_clip(float cell_clip) { _cell_clip = cell_clip; }
+  float proj_clip(void) const { return _proj_clip; }
+  void proj_clip(float proj_clip) { _proj_clip = proj_clip; }
+  bool time_major(void) const { return _time_major; }
+  void time_major(bool time_major) { _time_major = time_major; }
+  bool asymmetric_quantize_inputs(void) const { return _asymmetric_quantize_inputs; }
+  void asymmetric_quantize_inputs(bool asymmetric_quantize_inputs)
+  {
+    _asymmetric_quantize_inputs = asymmetric_quantize_inputs;
+  }
+
+private:
+  float _cell_clip = 0.0f;
+  float _proj_clip = 0.0f;
+  bool _time_major = false;
+  bool _asymmetric_quantize_inputs = false;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLEUNIDIRECTIONALSEQUENCELSTM_H__
diff --git a/compiler/luci/lang/include/luci/IR/SparsityParam.h b/compiler/luci/lang/include/luci/IR/SparsityParam.h
new file mode 100644
index 000000000..f471e5ef9
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/SparsityParam.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_SPARSITYPARAM_H__
+#define __LUCI_IR_SPARSITYPARAM_H__
+
+#include <cstdint>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace luci
+{
+
+enum DimensionType
+{
+  DENSE,
+  SPARSE_CSR,
+};
+
+enum SparseIndexVectorType
+{
+  NONE,
+  I32,
+  U16,
+  U8,
+};
+
+class SparseIndexVector
+{
+public:
+  SparseIndexVector() = default;
+  SparseIndexVector(const SparseIndexVectorType &type, const std::vector<int32_t> &sparse_index_vec)
+      : _type{type}
+  {
+    switch (type)
+    {
+      case SparseIndexVectorType::NONE:
+        break;
+      case SparseIndexVectorType::I32:
+      {
+        _vec_ptr = static_cast<void *>(
+            new std::vector<int32_t>(sparse_index_vec.begin(), sparse_index_vec.end()));
+        break;
+      }
+      case SparseIndexVectorType::U16:
+      {
+        auto new_vec = new std::vector<uint16_t>(sparse_index_vec.size());
+        for (uint32_t idx = 0; idx < sparse_index_vec.size(); idx++)
+        {
+          new_vec->at(idx) = static_cast<uint16_t>(sparse_index_vec.at(idx));
+        }
+        _vec_ptr = static_cast<void *>(new_vec);
+        break;
+      }
+      case SparseIndexVectorType::U8:
+      {
+        auto new_vec = new std::vector<uint8_t>(sparse_index_vec.size());
+        for (uint32_t idx = 0; idx < sparse_index_vec.size(); idx++)
+        {
+          new_vec->at(idx) = static_cast<uint8_t>(sparse_index_vec.at(idx));
+        }
+        _vec_ptr = static_cast<void *>(new_vec);
+        break;
+      }
+      default:
+        std::runtime_error("Invalid SparseIndexVectorType");
+    }
+  }
+
+  SparseIndexVector(SparseIndexVectorType type, const void *sparse_index_vec) : _type{type}
+  {
+    switch (type)
+    {
+      case SparseIndexVectorType::NONE:
+        break;
+      case SparseIndexVectorType::I32:
+      {
+        const std::vector<int32_t> *vec =
+            static_cast<const std::vector<int32_t> *>(sparse_index_vec);
+        _vec_ptr = static_cast<void *>(new std::vector<int32_t>(vec->begin(), vec->end()));
+        break;
+      }
+      case SparseIndexVectorType::U16:
+      {
+        const std::vector<uint16_t> *vec =
+            static_cast<const std::vector<uint16_t> *>(sparse_index_vec);
+        _vec_ptr = static_cast<void *>(new std::vector<uint16_t>(vec->begin(), vec->end()));
+        break;
+      }
+      case SparseIndexVectorType::U8:
+      {
+        const std::vector<uint8_t> *vec =
+            static_cast<const std::vector<uint8_t> *>(sparse_index_vec);
+        _vec_ptr = static_cast<void *>(new std::vector<uint8_t>(vec->begin(), vec->end()));
+        break;
+      }
+      default:
+        std::runtime_error("Invalid SparseIndexVectorType");
+    }
+  }
+
+  SparseIndexVector(const SparseIndexVector &sparse_index_vec)
+      : SparseIndexVector(sparse_index_vec._type, sparse_index_vec._vec_ptr)
+  {
+  }
+
+  SparseIndexVector(SparseIndexVector &&sparse_index_vec)
+      : _type{sparse_index_vec._type}, _vec_ptr{std::exchange(sparse_index_vec._vec_ptr, nullptr)}
+  {
+  }
+
+  SparseIndexVector &operator=(const SparseIndexVector &sparse_index_vec)
+  {
+    return *this = SparseIndexVector(sparse_index_vec);
+  }
+
+  SparseIndexVector &operator=(SparseIndexVector &&sparse_index_vector)
+  {
+    std::swap(_vec_ptr, sparse_index_vector._vec_ptr);
+    std::swap(_type, sparse_index_vector._type);
+    return *this;
+  }
+
+  ~SparseIndexVector()
+  {
+    switch (_type)
+    {
+      case SparseIndexVectorType::NONE:
+        break;
+      case SparseIndexVectorType::I32:
+      {
+        auto vec_ptr = static_cast<std::vector<int32_t> *>(_vec_ptr);
+        delete vec_ptr;
+        break;
+      }
+      case SparseIndexVectorType::U16:
+      {
+        auto vec_ptr = static_cast<std::vector<uint16_t> *>(_vec_ptr);
+        delete vec_ptr;
+        break;
+      }
+      case SparseIndexVectorType::U8:
+      {
+        auto vec_ptr = static_cast<std::vector<uint8_t> *>(_vec_ptr);
+        delete vec_ptr;
+        break;
+      }
+      default:
+        break;
+    }
+    _vec_ptr = nullptr;
+    _type = SparseIndexVectorType::NONE;
+  }
+
+public:
+  SparseIndexVectorType type(void) const { return _type; }
+
+public:
+  const std::vector<int32_t> *as_int32_vector(void) const
+  {
+    return _type == SparseIndexVectorType::I32 ? static_cast<const std::vector<int32_t> *>(_vec_ptr)
+                                               : nullptr;
+  }
+  const std::vector<uint16_t> *as_uint16_vector(void) const
+  {
+    return _type == SparseIndexVectorType::U16
+               ? static_cast<const std::vector<uint16_t> *>(_vec_ptr)
+               : nullptr;
+  }
+  const std::vector<uint8_t> *as_uint8_vector(void) const
+  {
+    return _type == SparseIndexVectorType::U8 ? static_cast<const std::vector<uint8_t> *>(_vec_ptr)
+                                              : nullptr;
+  }
+
+private:
+  SparseIndexVectorType _type{SparseIndexVectorType::NONE};
+  void *_vec_ptr{nullptr};
+};
+
+class DimMetaData
+{
+public:
+  DimMetaData() = delete;
+  DimMetaData(DimensionType format, int32_t dense_size) : _format{format}, _dense_size{dense_size}
+  {
+    // DO NOTHING
+  }
+  DimMetaData(DimensionType format, int32_t dense_size, const SparseIndexVector &array_segments,
+              const SparseIndexVector &array_indices)
+      : _format{format}, _dense_size{dense_size}, _array_segments{array_segments},
+        _array_indices{array_indices}
+  {
+    // DO NOTHING
+  }
+
+public:
+  DimensionType format(void) const { return _format; }
+  int32_t dense_size(void) const { return _dense_size; }
+  const SparseIndexVector &array_segments(void) const { return _array_segments; }
+  const SparseIndexVector &array_indices(void) const { return _array_indices; }
+
+private:
+  DimensionType _format{DimensionType::DENSE};
+  int32_t _dense_size{0};
+  SparseIndexVector _array_segments;
+  SparseIndexVector _array_indices;
+};
+
+struct SparsityParam
+{
+  std::vector<int32_t> traversal_order;
+  std::vector<int32_t> block_map;
+  std::vector<DimMetaData> dim_metadata;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_SPARSITYPARAM_H__
diff --git a/compiler/luci/lang/src/Nodes/CircleDequantize.test.cpp b/compiler/luci/lang/src/Nodes/CircleDequantize.test.cpp
new file mode 100644
index 000000000..c3a132c60
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleDequantize.test.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleDequantize.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+TEST(CircleDequantizeTest, constructor)
+{
+  luci::CircleDequantize dequant_node;
+
+  ASSERT_EQ(luci::CircleDialect::get(), dequant_node.dialect());
+  ASSERT_EQ(luci::CircleOpcode::DEQUANTIZE, dequant_node.opcode());
+
+  ASSERT_EQ(nullptr, dequant_node.input());
+}
+
+TEST(CircleDequantizeTest, common_NEG)
+{
+  luci::CircleDequantize dequant_node;
+
+  dequant_node.name("name");
+  ASSERT_EQ("name", dequant_node.name());
+
+  auto q = std::make_unique<luci::CircleQuantParam>();
+  dequant_node.quantparam(std::move(q));
+  ASSERT_NE(nullptr, dequant_node.quantparam());
+
+  ASSERT_EQ(luci::ShapeStatus::UNDEFINED, dequant_node.shape_status());
+  dequant_node.shape_status(luci::ShapeStatus::NOSHAPE);
+  ASSERT_NE(luci::ShapeStatus::UNDEFINED, dequant_node.shape_status());
+}
+
+TEST(CircleDequantizeTest, input_NEG)
+{
+  luci::CircleDequantize dequant_node;
+  luci::CircleDequantize node;
+
+  dequant_node.input(&node);
+  ASSERT_NE(nullptr, dequant_node.input());
+
+  dequant_node.input(nullptr);
+  ASSERT_EQ(nullptr, dequant_node.input());
+}
+
+TEST(CircleDequantizeTest, arity_NEG)
+{
+  luci::CircleDequantize dequant_node;
+
+  ASSERT_NO_THROW(dequant_node.arg(0));
+  ASSERT_THROW(dequant_node.arg(1), std::out_of_range);
+}
+
+TEST(CircleDequantizeTest, visit_mutable_NEG)
+{
+  struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+  {
+  };
+
+  luci::CircleDequantize dequant_node;
+
+  TestVisitor tv;
+  ASSERT_THROW(dequant_node.accept(&tv), std::exception);
+}
+
+TEST(CircleDequantizeTest, visit_NEG)
+{
+  struct TestVisitor final : public luci::CircleNodeVisitor<void>
+  {
+  };
+
+  luci::CircleDequantize dequant_node;
+
+  TestVisitor tv;
+  ASSERT_THROW(dequant_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleSparseToDense.test.cpp b/compiler/luci/lang/src/Nodes/CircleSparseToDense.test.cpp
index 03f612ba7..073be6bcb 100644
--- a/compiler/luci/lang/src/Nodes/CircleSparseToDense.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleSparseToDense.test.cpp
@@ -33,7 +33,7 @@ TEST(CircleSparseToDenseTest, constructor)
   ASSERT_EQ(nullptr, stb_node.values());
   ASSERT_EQ(nullptr, stb_node.default_value());
 
-  ASSERT_EQ(false, stb_node.validate_indices());
+  ASSERT_FALSE(stb_node.validate_indices());
 }
 
 TEST(CircleSparseToDenseTest, input_NEG)
diff --git a/compiler/luci/lang/src/Nodes/CircleSum.test.cpp b/compiler/luci/lang/src/Nodes/CircleSum.test.cpp
index 84b51d671..f9d07b200 100644
--- a/compiler/luci/lang/src/Nodes/CircleSum.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleSum.test.cpp
@@ -30,7 +30,7 @@ TEST(CircleSumTest, constructor_P)
 
   ASSERT_EQ(nullptr, sum_node.input());
   ASSERT_EQ(nullptr, sum_node.reduction_indices());
-  ASSERT_EQ(false, sum_node.keep_dims());
+  ASSERT_FALSE(sum_node.keep_dims());
 }
 
 TEST(CircleSumTest, input_NEG)
diff --git a/compiler/luci/lang/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp b/compiler/luci/lang/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
new file mode 100644
index 000000000..6b00d6f4c
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleUnidirectionalSequenceLSTMTest, constructor_P)
+{
+  luci::CircleUnidirectionalSequenceLSTM trc_node;
+
+  ASSERT_EQ(luci::CircleDialect::get(), trc_node.dialect());
+  ASSERT_EQ(luci::CircleOpcode::UNIDIRECTIONAL_SEQUENCE_LSTM, trc_node.opcode());
+
+  ASSERT_EQ(nullptr, trc_node.input());
+
+  ASSERT_EQ(nullptr, trc_node.input_to_input_weights());
+  ASSERT_EQ(nullptr, trc_node.input_to_forget_weights());
+  ASSERT_EQ(nullptr, trc_node.input_to_cell_weights());
+  ASSERT_EQ(nullptr, trc_node.input_to_output_weights());
+
+  ASSERT_EQ(nullptr, trc_node.recurrent_to_input_weights());
+  ASSERT_EQ(nullptr, trc_node.recurrent_to_forget_weights());
+  ASSERT_EQ(nullptr, trc_node.recurrent_to_cell_weights());
+  ASSERT_EQ(nullptr, trc_node.recurrent_to_output_weights());
+
+  ASSERT_EQ(nullptr, trc_node.cell_to_input_weights());
+  ASSERT_EQ(nullptr, trc_node.cell_to_forget_weights());
+  ASSERT_EQ(nullptr, trc_node.cell_to_output_weights());
+
+  ASSERT_EQ(nullptr, trc_node.input_gate_bias());
+  ASSERT_EQ(nullptr, trc_node.forget_gate_bias());
+  ASSERT_EQ(nullptr, trc_node.cell_gate_bias());
+  ASSERT_EQ(nullptr, trc_node.output_gate_bias());
+
+  ASSERT_EQ(nullptr, trc_node.projection_weights());
+  ASSERT_EQ(nullptr, trc_node.projection_bias());
+
+  ASSERT_EQ(nullptr, trc_node.activation_state());
+  ASSERT_EQ(nullptr, trc_node.cell_state());
+
+  ASSERT_EQ(nullptr, trc_node.input_layer_norm_coefficients());
+  ASSERT_EQ(nullptr, trc_node.forget_layer_norm_coefficients());
+  ASSERT_EQ(nullptr, trc_node.cell_layer_norm_coefficients());
+  ASSERT_EQ(nullptr, trc_node.output_layer_norm_coefficients());
+
+  ASSERT_EQ(luci::FusedActFunc::UNDEFINED, trc_node.fusedActivationFunction());
+  ASSERT_EQ(0.f, trc_node.cell_clip());
+  ASSERT_EQ(0.f, trc_node.proj_clip());
+  ASSERT_EQ(false, trc_node.time_major());
+  ASSERT_EQ(false, trc_node.asymmetric_quantize_inputs());
+}
+
+TEST(CircleUnidirectionalSequenceLSTMTest, arity_NEG)
+{
+  luci::CircleUnidirectionalSequenceLSTM trc_node;
+
+  ASSERT_NO_THROW(trc_node.arg(20));
+  ASSERT_THROW(trc_node.arg(24), std::out_of_range);
+}
+
+TEST(CircleUnidirectionalSequenceLSTMTest, visit_mutable_NEG)
+{
+  struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+  {
+  };
+
+  luci::CircleUnidirectionalSequenceLSTM trc_node;
+
+  TestVisitor tv;
+  ASSERT_THROW(trc_node.accept(&tv), std::exception);
+}
+
+TEST(CircleUnidirectionalSequenceLSTMTest, visit_NEG)
+{
+  struct TestVisitor final : public luci::CircleNodeVisitor<void>
+  {
+  };
+
+  luci::CircleUnidirectionalSequenceLSTM trc_node;
+
+  TestVisitor tv;
+  ASSERT_THROW(trc_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/logex/src/FormattedGraph.cpp b/compiler/luci/logex/src/FormattedGraph.cpp
index bb7c73d5f..5b543e769 100644
--- a/compiler/luci/logex/src/FormattedGraph.cpp
+++ b/compiler/luci/logex/src/FormattedGraph.cpp
@@ -92,6 +92,10 @@ const char *to_str(luci::FusedActFunc fused)
       return "RELU_N1_TO_1";
     case luci::FusedActFunc::RELU6:
       return "RELU6";
+    case luci::FusedActFunc::TANH:
+      return "TANH";
+    case luci::FusedActFunc::SIGN_BIT:
+      return "SIGN_BIT";
     default:
       return "Error";
   }
@@ -210,6 +214,7 @@ private:
   IMPLEMENT(luci::CircleCustom)
   IMPLEMENT(luci::CircleDepthToSpace)
   IMPLEMENT(luci::CircleDepthwiseConv2D)
+  IMPLEMENT(luci::CircleDequantize)
   IMPLEMENT(luci::CircleDiv)
   IMPLEMENT(luci::CircleElu)
   IMPLEMENT(luci::CircleExp)
@@ -294,6 +299,7 @@ private:
   IMPLEMENT(luci::CircleTopKV2)
   IMPLEMENT(luci::CircleTranspose)
   IMPLEMENT(luci::CircleTransposeConv)
+  IMPLEMENT(luci::CircleUnidirectionalSequenceLSTM)
   IMPLEMENT(luci::CircleUnique)
   IMPLEMENT(luci::CircleUnpack)
   IMPLEMENT(luci::CircleWhere)
@@ -980,12 +986,61 @@ bool summary_node(const locop::SymbolTable *tbl, const luci::CircleTransposeConv
   s.args().append("inputSizes", tbl->lookup(node->inputSizes()));
   s.args().append("filter", tbl->lookup(node->filter()));
   s.args().append("outBackprop", tbl->lookup(node->outBackprop()));
+  s.args().append("bias", tbl->lookup(node->bias()));
   s.args().append("stride(h,w)", to_str(node->stride()));
   s.args().append("padding", to_str(node->padding()));
   s.state(locop::NodeSummary::State::Complete);
   return true;
 }
 
+bool summary_node(const locop::SymbolTable *tbl, const luci::CircleUnidirectionalSequenceLSTM *node,
+                  locop::NodeSummary &s)
+{
+  s.args().append("input", tbl->lookup(node->input()));
+
+  s.args().append("input_to_input_weights", tbl->lookup(node->input_to_input_weights()));
+  s.args().append("input_to_forget_weights", tbl->lookup(node->input_to_forget_weights()));
+  s.args().append("input_to_cell_weights", tbl->lookup(node->input_to_cell_weights()));
+  s.args().append("input_to_output_weights", tbl->lookup(node->input_to_output_weights()));
+
+  s.args().append("recurrent_to_input_weights", tbl->lookup(node->recurrent_to_input_weights()));
+  s.args().append("recurrent_to_forget_weights", tbl->lookup(node->recurrent_to_forget_weights()));
+  s.args().append("recurrent_to_cell_weights", tbl->lookup(node->recurrent_to_cell_weights()));
+  s.args().append("recurrent_to_output_weights", tbl->lookup(node->recurrent_to_output_weights()));
+
+  s.args().append("cell_to_input_weights", tbl->lookup(node->cell_to_input_weights()));
+  s.args().append("cell_to_forget_weights", tbl->lookup(node->cell_to_forget_weights()));
+  s.args().append("cell_to_output_weights", tbl->lookup(node->cell_to_output_weights()));
+
+  s.args().append("input_gate_bias", tbl->lookup(node->input_gate_bias()));
+  s.args().append("forget_gate_bias", tbl->lookup(node->forget_gate_bias()));
+  s.args().append("cell_gate_bias", tbl->lookup(node->cell_gate_bias()));
+  s.args().append("output_gate_bias", tbl->lookup(node->output_gate_bias()));
+
+  s.args().append("projection_weights", tbl->lookup(node->projection_weights()));
+  s.args().append("projection_bias", tbl->lookup(node->projection_bias()));
+
+  s.args().append("activation_state", tbl->lookup(node->activation_state()));
+  s.args().append("cell_state", tbl->lookup(node->cell_state()));
+
+  s.args().append("input_layer_norm_coefficients",
+                  tbl->lookup(node->input_layer_norm_coefficients()));
+  s.args().append("forget_layer_norm_coefficients",
+                  tbl->lookup(node->forget_layer_norm_coefficients()));
+  s.args().append("cell_layer_norm_coefficients",
+                  tbl->lookup(node->cell_layer_norm_coefficients()));
+  s.args().append("output_layer_norm_coefficients",
+                  tbl->lookup(node->output_layer_norm_coefficients()));
+
+  s.args().append("cell_clip", to_str(node->cell_clip()));
+  s.args().append("proj_clip", to_str(node->proj_clip()));
+  s.args().append("time_major", to_str(node->time_major()));
+  s.args().append("asymmetric_quantize_inputs", to_str(node->asymmetric_quantize_inputs()));
+
+  s.state(locop::NodeSummary::State::Complete);
+  return true;
+}
+
 bool summary_node(const locop::SymbolTable *tbl, const luci::CircleUnique *node,
                   locop::NodeSummary &s)
 {
@@ -1225,6 +1280,12 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleDepthwiseConv2D *node,
   return summary_node(tbl(), node, s);
 }
 
+bool CircleNodeSummaryBuilder::summary(const luci::CircleDequantize *node,
+                                       locop::NodeSummary &s) const
+{
+  return use_input(tbl(), node, s);
+}
+
 bool CircleNodeSummaryBuilder::summary(const luci::CircleDiv *node, locop::NodeSummary &s) const
 {
   return use_xy(tbl(), node, s);
@@ -1686,6 +1747,12 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleTransposeConv *node,
   return summary_node(tbl(), node, s);
 }
 
+bool CircleNodeSummaryBuilder::summary(const luci::CircleUnidirectionalSequenceLSTM *node,
+                                       locop::NodeSummary &s) const
+{
+  return summary_node(tbl(), node, s);
+}
+
 bool CircleNodeSummaryBuilder::summary(const luci::CircleUnique *node, locop::NodeSummary &s) const
 {
   return summary_node(tbl(), node, s);
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index a832844f8..32ab85ef5 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -32,6 +32,7 @@ public:
   {
     enum Algorithm
     {
+      FuseAddWithTConv,
       FuseBatchNormWithTConv,
       FuseBCQ,
       FuseInstanceNorm,
@@ -41,13 +42,23 @@ public:
       QuantizeDequantizeWeights,
       QuantizeWithMinMax,
       Requantize,
+      FoldDequantize,
+      SparsifyTensorPass,
     };
 
     enum AlgorithmParameters
     {
+      // quantize
       Quantize_input_dtype,
       Quantize_output_dtype,
-      Quantize_granularity // layer-wise or channel-wise
+      Quantize_granularity, // layer-wise or channel-wise
+
+      // sparsify
+      Sparsify_tensor_name,
+      Sparsify_traversal_order,
+      Sparsify_format,
+      Sparsify_block_size,
+      Sparsify_block_map,
     };
 
     virtual ~Options() = default;
@@ -67,6 +78,8 @@ public:
 
   void quantize(loco::Graph *) const;
 
+  void sparsify(loco::Graph *) const;
+
 private:
   std::unique_ptr<Options> _options;
 };
diff --git a/compiler/luci/pass/include/luci/Pass/FoldDequantizePass.h b/compiler/luci/pass/include/luci/Pass/FoldDequantizePass.h
new file mode 100644
index 000000000..07610d3e1
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldDequantizePass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_DEQUANTIZE_PASS_H__
+#define __LUCI_FOLD_DEQUANTIZE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fold Dequantize, which can be folded by constant inputs
+ *
+ */
+struct FoldDequantizePass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FOLD_DEQUANTIZE"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_DEQUANTIZE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FuseAddWithTConvPass.h b/compiler/luci/pass/include/luci/Pass/FuseAddWithTConvPass.h
new file mode 100644
index 000000000..89b120397
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseAddWithTConvPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_ADD_WITH_TCONV_PASS_H__
+#define __LUCI_FUSE_ADD_WITH_TCONV_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse Add into CircleTransposeConv
+ */
+struct FuseAddWithTConvPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseAddWithTConvPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_ADD_WITH_TCONV_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h b/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h
new file mode 100644
index 000000000..41f43bf88
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SPARSIFY_TENSOR_PASS_H__
+#define __LUCI_SPARSIFY_TENSOR_PASS_H__
+
+#include <logo/Pass.h>
+
+#include <luci/IR/SparsityParam.h>
+
+namespace luci
+{
+
+class CircleConst;
+
+/**
+ * @brief  Pass to sparsify tensor
+ */
+struct SparsifyTensorPass final : public logo::Pass
+{
+public:
+  SparsifyTensorPass(const std::string &tensor_name, const std::vector<int32_t> &traversal_order,
+                     const std::vector<DimensionType> &format,
+                     const std::vector<int32_t> &block_size, const std::vector<int32_t> &block_map)
+      : _tensor_name{tensor_name}, _traversal_order{traversal_order}, _format{format},
+        _block_size{block_size}, _block_map{block_map}
+  {
+    // DO NOTHING
+  }
+
+public:
+  const char *name(void) const final { return "luci::SparsifyTensorPass"; }
+
+  bool run(loco::Graph *g) final;
+
+  template <loco::DataType DT> void sparsify_tensor(luci::CircleConst *cop);
+
+private:
+  // Tensor name that the pass will sparsify
+  std::string _tensor_name;
+  std::vector<int32_t> _traversal_order;
+  std::vector<DimensionType> _format;
+  std::vector<int32_t> _block_size;
+  std::vector<int32_t> _block_map;
+};
+
+extern template void
+SparsifyTensorPass::sparsify_tensor<loco::DataType::S32>(luci::CircleConst *cop);
+extern template void
+SparsifyTensorPass::sparsify_tensor<loco::DataType::S8>(luci::CircleConst *cop);
+extern template void
+SparsifyTensorPass::sparsify_tensor<loco::DataType::FLOAT32>(luci::CircleConst *cop);
+
+} // namespace luci
+
+#endif // __LUCI_SPARSIFY_TENSOR_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 2ee759b4e..0e6056ffc 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -16,6 +16,8 @@
 
 #include "luci/CircleOptimizer.h"
 
+#include "luci/Pass/FoldDequantizePass.h"
+#include "luci/Pass/FuseAddWithTConvPass.h"
 #include "luci/Pass/FuseBatchNormWithTConv.h"
 #include "luci/Pass/FuseBCQPass.h"
 #include "luci/Pass/FuseInstanceNormPass.h"
@@ -25,6 +27,7 @@
 #include "luci/Pass/RequantizePass.h"
 #include "luci/Pass/QuantizeWithMinMaxPass.h"
 #include "luci/Pass/QuantizeDequantizeWeightsPass.h"
+#include "luci/Pass/SparsifyTensorPass.h"
 // TODO add more passes
 
 #include "luci/Pass/ShapeInferencePass.h"
@@ -40,10 +43,25 @@
 #include <logo/Phase.h>
 
 #include <memory>
+#include <sstream>
 
 namespace
 {
 
+std::vector<int> parseIntFromCommadelimitedStr(std::string str)
+{
+  std::vector<int> ret;
+  std::istringstream is(str);
+  for (uint32_t i; is >> i;)
+  {
+    assert(i != ',');
+    ret.push_back(i);
+    if (is.peek() == ',')
+      is.ignore();
+  }
+  return ret;
+}
+
 using namespace luci;
 
 class OptimizeOptionsImpl final : public luci::CircleOptimizer::Options
@@ -132,6 +150,14 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<FuseBatchNormWithTConvPass>());
   }
+  if (_options->query(Options::Algorithm::FuseAddWithTConv))
+  {
+    phase.emplace_back(std::make_unique<FuseAddWithTConvPass>());
+  }
+  if (_options->query(Options::Algorithm::FoldDequantize))
+  {
+    phase.emplace_back(std::make_unique<luci::FoldDequantizePass>());
+  }
 
   // Shape inference is needed for added nodes doing above transformations
   phase.emplace_back(std::make_unique<luci::ShapeInferencePass>());
@@ -151,7 +177,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
   if (_options->query(Options::Algorithm::QuantizeDequantizeWeights))
   {
     static const std::vector<std::string> fakeq_supported_input_dtype{"float32"};
-    static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"};
+    static const std::vector<std::string> fakeq_supported_output_dtype{"uint8", "int16"};
     static const std::vector<std::string> fakeq_supported_granularity{"layer", "channel"};
 
     auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
@@ -187,7 +213,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
   if (_options->query(Options::Algorithm::QuantizeWithMinMax))
   {
     static const std::vector<std::string> qwmm_supported_input_dtype{"float32"};
-    static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"};
+    static const std::vector<std::string> qwmm_supported_output_dtype{"uint8", "int16"};
     static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
 
     auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
@@ -244,4 +270,41 @@ void CircleOptimizer::quantize(loco::Graph *g) const
   phase_runner.run(phase);
 }
 
+void CircleOptimizer::sparsify(loco::Graph *g) const
+{
+  if (_options->query(Options::Algorithm::SparsifyTensorPass))
+  {
+    std::string tensor_name = _options->param(Options::AlgorithmParameters::Sparsify_tensor_name);
+    std::string str_tarversal_order =
+        _options->param(Options::AlgorithmParameters::Sparsify_traversal_order);
+    std::string str_format = _options->param(Options::AlgorithmParameters::Sparsify_format);
+    std::string str_block_size = _options->param(Options::AlgorithmParameters::Sparsify_block_size);
+    std::string str_block_map = _options->param(Options::AlgorithmParameters::Sparsify_block_map);
+
+    // traversal order
+    std::vector<int32_t> traversal_order = parseIntFromCommadelimitedStr(str_tarversal_order);
+    // format
+    std::vector<DimensionType> format;
+    std::istringstream is(str_format);
+    for (char c; is >> c;)
+    {
+      assert(c != ',');
+      if (c == 'd')
+        format.push_back(DimensionType::DENSE);
+      else if (c == 's')
+        format.push_back(DimensionType::SPARSE_CSR);
+      if (is.peek() == ',')
+        is.ignore();
+    }
+    // block size
+    std::vector<int32_t> block_size = parseIntFromCommadelimitedStr(str_block_size);
+    // block map
+    std::vector<int32_t> block_map = parseIntFromCommadelimitedStr(str_block_map);
+
+    luci::SparsifyTensorPass sparsifier{tensor_name, traversal_order, format, block_size,
+                                        block_map};
+    sparsifier.run(g);
+  }
+}
+
 } // namespace luci
diff --git a/compiler/luci/pass/src/FoldDequantizePass.cpp b/compiler/luci/pass/src/FoldDequantizePass.cpp
new file mode 100644
index 000000000..01c04f478
--- /dev/null
+++ b/compiler/luci/pass/src/FoldDequantizePass.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldDequantizePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <loco/Service/TypeInference.h>
+
+namespace
+{
+
+bool is_hybrid_kernel_supported(loco::Node *node)
+{
+  if (dynamic_cast<luci::CircleFullyConnected *>(node) != nullptr)
+    return true;
+
+  return false;
+}
+
+bool is_foldable_const(luci::CircleConst *node)
+{
+  if (node->quantparam() == nullptr)
+    return false;
+
+  if (node->dtype() == loco::DataType::S8)
+    return true;
+  if (node->dtype() == loco::DataType::U8)
+    return true;
+
+  return false;
+}
+
+luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
+{
+  if (const_node->quantparam() == nullptr)
+  {
+    throw std::runtime_error("Given constant node has no quantization parameter");
+  }
+
+  auto g = const_node->graph();
+  auto new_const_node = g->nodes()->create<luci::CircleConst>();
+
+  new_const_node->dtype(loco::DataType::FLOAT32);
+  new_const_node->rank(const_node->rank());
+  uint32_t dim_size = 1;
+  for (uint32_t i = 0; i < new_const_node->rank(); ++i)
+  {
+    new_const_node->dim(i) = const_node->dim(i);
+    dim_size *= const_node->dim(i).value();
+  }
+  new_const_node->size<loco::DataType::FLOAT32>(dim_size);
+  new_const_node->shape_status(luci::ShapeStatus::VALID);
+
+  const int32_t q_dim = const_node->quantparam()->quantized_dimension;
+  const int32_t q_dim_value = const_node->dim(q_dim).value();
+
+  int32_t right_count = q_dim_value;
+  for (uint32_t i = q_dim + 1; i < const_node->rank(); ++i)
+    right_count *= const_node->dim(i).value();
+
+  if (const_node->dtype() == loco::DataType::S8)
+  {
+    for (uint32_t i = 0; i < const_node->size<loco::DataType::S8>(); ++i)
+    {
+      uint32_t qd = (i % right_count) / (right_count / q_dim_value);
+      if (qd >= const_node->quantparam()->zerop.size())
+        qd = 0;
+
+      new_const_node->at<loco::DataType::FLOAT32>(i) =
+          (float)(const_node->at<loco::DataType::S8>(i) - const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+    }
+  }
+  else
+  {
+    for (uint32_t i = 0; i < const_node->size<loco::DataType::U8>(); ++i)
+    {
+      uint32_t qd = (i % right_count) / (right_count / q_dim_value);
+      if (qd >= const_node->quantparam()->zerop.size())
+        qd = 0;
+
+      new_const_node->at<loco::DataType::FLOAT32>(i) =
+          (float)((int)const_node->at<loco::DataType::U8>(i) -
+                  const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+    }
+  }
+
+  return new_const_node;
+}
+
+bool replace_const_node(loco::Node *node, luci::CircleConst *const_node)
+{
+  if (auto gather = dynamic_cast<luci::CircleGather *>(node))
+  {
+    gather->params(dequantized_const_node(const_node));
+    gather->dtype(loco::DataType::FLOAT32);
+    return true;
+  }
+  else
+  {
+    // TODO Support more ops
+    return false;
+  }
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ *
+ * Folding pattern 1 - When input of Dequantize is foldable constant
+ *
+ * [Before]
+ *     quantized_const_input ---------- Dequantize ---------- Op ---
+ *                             +-- Op1_with_quant_input ---
+ *                             +-- Op2_with_quant_input ---
+ *
+ * [After]
+ *   dequantized_const_input -------------------------------- Op ---
+ *
+ *     quantized_const_input ----- Op1_with_quant_input ---
+ *                             +-- Op2_with_quant_input ---
+ *
+ *
+ * Folding pattern 2 - When input of Dequantize uses quantized output value
+ *
+ * [Before]
+ *     quantized_const_input ----- Gather ----- Dequantize --- Op ---
+ *                             +-- Op1_with_quant_input ---
+ *                             +-- Op2_with_quant_input ---
+ *
+ * [After]
+ *   dequantized_const_input ------Gather -------------------- Op ---
+ *
+ *     quantized_const_input ----- Op1_with_quant_input ---
+ *                             +-- Op2_with_quant_input ---
+ *
+ *
+ */
+bool FoldDequantizePass::run(loco::Graph *g)
+{
+  bool changed = false;
+
+  for (auto node : loco::all_nodes(g))
+  {
+    if (auto circle_dequant = dynamic_cast<luci::CircleDequantize *>(node))
+    {
+      if (auto const_input = dynamic_cast<luci::CircleConst *>(circle_dequant->input()))
+      {
+        // Pattern 1 - When input of Dequantize is foldable constant
+        if (is_foldable_const(const_input))
+        {
+          loco::replace(circle_dequant).with(dequantized_const_node(const_input));
+          changed = true;
+        }
+      }
+    }
+    else if (auto const_node = dynamic_cast<luci::CircleConst *>(node))
+    {
+      if (is_foldable_const(const_node))
+      {
+        for (auto const_node_user : loco::succs(const_node))
+        {
+          // If user is hybrid kernel supported operation, do not dequantize
+          if (is_hybrid_kernel_supported(const_node_user))
+            continue;
+
+          auto users = loco::succs(const_node_user);
+          if (users.size() > 1)
+            continue;
+
+          // Pattern 2 - When input of Dequantize uses quantized output value
+          if (auto dequant = dynamic_cast<luci::CircleDequantize *>(*users.begin()))
+          {
+            if (replace_const_node(const_node_user, const_node))
+            {
+              loco::replace(dequant).with(const_node_user);
+              changed = true;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseAddWithTConvPass.cpp b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
new file mode 100644
index 000000000..bd7805f6a
--- /dev/null
+++ b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseAddWithTConvPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+/**
+ *  Fuse add to TCONV if possible
+ *
+ *  BEFORE
+ *
+ *         [CircleTransposeConv]
+ *                  |
+ *                [add]
+ *  AFTER
+ *
+ *         [CircleTransposeConv]
+ */
+bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
+{
+  // check whether it has bias or not. This optimization works only if it doesn't.
+  auto bias = dynamic_cast<luci::CircleOutputExclude *>(tconv->bias());
+  if (not bias)
+    return false;
+
+  // get weight of tconv
+  auto filter = dynamic_cast<luci::CircleConst *>(tconv->filter());
+  if (not filter)
+    return false;
+  if (filter->dtype() != loco::DataType::FLOAT32)
+    return false;
+
+  // get add node
+  auto tconv_output = loco::succs(tconv);
+  assert(tconv_output.size() == 1);
+  auto add = dynamic_cast<luci::CircleAdd *>(*tconv_output.begin());
+  if (not add)
+    return false;
+  if (add->dtype() != loco::DataType::FLOAT32)
+    return false;
+  if (add->fusedActivationFunction() != luci::FusedActFunc::NONE &&
+      add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
+    return false;
+
+  // get addition
+  luci::CircleConst *addition = nullptr;
+  if (add->x() == tconv)
+    addition = dynamic_cast<luci::CircleConst *>(add->y());
+  else
+    addition = dynamic_cast<luci::CircleConst *>(add->x());
+
+  if (not addition)
+    return false;
+
+  // addition dim(0) == tconv filter channel dim
+  if (addition->rank() != 1)
+    return false;
+  auto addition_dim = addition->dim(0).value();
+  auto filter_channel_dim = filter->dim(0).value();
+  if (filter_channel_dim != addition_dim)
+    return false;
+
+  // fuse addition with transposed conv
+  tconv->bias(addition);
+
+  if (add->fusedActivationFunction() == luci::FusedActFunc::RELU6)
+  {
+    // separate relu op from add op
+    auto relu = add->graph()->nodes()->create<luci::CircleRelu6>();
+    relu->features(tconv);
+
+    // remove add node
+    replace(add).with(relu);
+  }
+  else
+  {
+    replace(add).with(tconv);
+  }
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseAddWithTConvPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto tconv = dynamic_cast<luci::CircleTransposeConv *>(node);
+    if (not tconv)
+      continue;
+
+    if (fuse_add_with_tconv(tconv))
+      changed = true;
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp
index 7aa2e3e80..ebf28779b 100644
--- a/compiler/luci/pass/src/FuseBCQPass.cpp
+++ b/compiler/luci/pass/src/FuseBCQPass.cpp
@@ -17,163 +17,139 @@
 #include "luci/Pass/FuseBCQPass.h"
 
 #include <luci/IR/CircleNodes.h>
+#include <luci/Log.h>
 
 #include <cassert>
-#include <string>
 #include <set>
 
 namespace
 {
 
-/**
- * @brief Circle nodes including BCQ information and a circle node to which BCQ will be applied
- *        are connected with their name. And their names include common prefix.
- *        However, after pb file is converted to tflite file, some nodes' name are changed.
- *        Thus this function will return original common prefix.
- *
- * @note  All the re-naming rule of TFLite converter is not figured out.
- *        Therefore, if new naming rule is detected, this function should be updated.
- */
-const std::string node_name_prefix(luci::NodeName node_name)
-{
-  std::string prefix = node_name;
-
-  if (prefix.find("/ReadVariableOp/resource") != std::string::npos)
-  {
-    const auto start_index = prefix.find("/ReadVariableOp/resource");
-
-    const auto left_prefix = prefix.substr(0, start_index);
-    const auto right_prefix = prefix.substr(start_index + 24);
-
-    prefix = left_prefix + right_prefix;
-  }
-
-  if (prefix.find("Tensordot/") != std::string::npos)
-  {
-    const auto index = prefix.find("Tensordot/");
-    prefix = prefix.substr(0, index - 1);
-  }
-  else if (prefix.find("/MatMul") != std::string::npos)
-  {
-    const auto index = prefix.find("/MatMul");
-    prefix = prefix.substr(0, index);
-  }
-  else if (prefix.find("kernel/") != std::string::npos)
-  {
-    const auto index = prefix.find("kernel/");
-    prefix = prefix.substr(0, index - 1);
-  }
-  else if (prefix.find("/bcqinfo_") != std::string::npos)
-  {
-    const auto index = prefix.find("/bcqinfo_");
-    prefix = prefix.substr(0, index);
-  }
-
-  return prefix;
-}
-
-/**
- * @brief Create CircleOutputExclude operation, which has same shape and dtype with
- *        original circle_node.
- */
-luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node)
-{
-  auto graph = circle_node->graph();
-  auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
-
-  if (circle_node->shape_status() == luci::ShapeStatus::VALID)
-  {
-    noOp->dtype(circle_node->dtype());
-    noOp->rank(circle_node->rank());
-    for (uint32_t i = 0; i < circle_node->rank(); ++i)
-      noOp->dim(i) = circle_node->dim(i);
-  }
-  else
-  {
-    // For type inference
-    noOp->dtype(loco::DataType::FLOAT32);
-  }
-
-  return noOp;
-};
-
-} // namespace
-
-namespace
-{
-
 // V means the version of BCQ.
 template <int32_t V> class BCQFuser;
 
 template <> class BCQFuser<1>
 {
 public:
+  BCQFuser<1>(int32_t original_output_cnt, int32_t bundle_cnt)
+      : _original_output_cnt{original_output_cnt}, _bundle_cnt{bundle_cnt}
+  {
+    // Do nothing
+  }
+
+public:
   bool fuseBCQ(loco::Graph *g)
   {
-    bool changed = false;
 
-    for (auto node : loco::all_nodes(g))
+    const auto output_nodes = loco::output_nodes(g);
+    for (auto node : output_nodes)
     {
-      if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
+      auto output_node = loco::must_cast<luci::CircleOutput *>(node);
+
+      /**
+       * First output of model is metadata for BCQ. Please refer to following example.
+       *
+       * When original_output_cnt is 2,
+       * BCQ_METADATA, original_output_1, original_output_2, BCQ_INFO_1, ...
+       */
+      if ((int)output_node->index() > _original_output_cnt)
       {
-        add_BCQ_info_node(circle_const);
+        const auto prefix = (output_node->index() - (_original_output_cnt + 1)) / (_bundle_cnt);
+        const MetadataType metadata_type = static_cast<MetadataType>(
+            (output_node->index() - (_original_output_cnt + 1)) % (_bundle_cnt));
+        const auto circle_node = loco::must_cast<luci::CircleNode *>(output_node->from());
+        add_BCQ_info_node(prefix, metadata_type, circle_node);
       }
     }
 
     if (!is_bcqinfo_valid())
       return false;
 
-    for (auto node : loco::active_nodes(loco::output_nodes(g)))
+    for (auto f : _fusable_op)
     {
+      auto prefix = f.first;
+      luci::CircleNode *node = f.second;
+
+      if (!is_valid_prefix(prefix))
+        continue;
+
+      // Fuse Gather to BCQGather
       if (auto gather = dynamic_cast<luci::CircleGather *>(node))
       {
-        auto params = dynamic_cast<luci::CircleConst *>(gather->params());
-        if (params != nullptr && has_BCQ_info(params))
+        if (auto params = dynamic_cast<luci::CircleConst *>(gather->params()))
         {
           auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
 
           bcq_gather->op_version(1);
-          bcq_gather->input_scales(get_alpha(params));
-          bcq_gather->input_binary(get_packed_binary_code(params));
+          bcq_gather->input_scales(_alpha[prefix]);
+          bcq_gather->input_binary(_packed_binary_code[prefix]);
           bcq_gather->indices(gather->indices());
-          bcq_gather->input_clusters(packed_clusters(params));
+          bcq_gather->input_clusters(packed_clusters(g, prefix));
 
-          // input_binary shape : [output_size, hidden_size]
-          const auto binary_hidden_size =
-              loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
-          bcq_gather->input_hidden_size(binary_hidden_size);
-
-          if (do_w_x(params))
+          if (_do_w_x[prefix]->at<loco::DataType::BOOL>(0))
           {
+            bcq_gather->input_hidden_size(params->dim(1).value());
             bcq_gather->axis(gather->axis());
+            loco::replace(gather).with(bcq_gather);
           }
           else
           {
+            bcq_gather->input_hidden_size(params->dim(0).value());
             const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
             bcq_gather->axis(axis_transpose);
+
+            const auto indices_rank =
+                loco::must_cast<luci::CircleNode *>(gather->indices())->rank();
+
+            auto perm = g->nodes()->create<luci::CircleConst>();
+            perm->dtype(loco::DataType::S32);
+            perm->size<loco::DataType::S32>(1 + indices_rank);
+            perm->rank(1);
+            perm->dim(0) = 1 + indices_rank;
+            for (uint32_t idx = 0; idx < indices_rank; ++idx)
+              perm->at<loco::DataType::S32>(idx) = idx + 1;
+            perm->at<loco::DataType::S32>(indices_rank) = 0;
+            perm->shape_status(luci::ShapeStatus::VALID);
+
+            auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+            output_transpose->a(bcq_gather);
+            output_transpose->perm(perm);
+
+            loco::replace(gather).with(output_transpose);
           }
 
-          loco::replace(gather).with(bcq_gather);
+          return true;
+        }
+      }
 
-          changed = true;
+      // Einsum is unpacked to FullyConnected, Pack and Reshape
+      if (auto reshape = dynamic_cast<luci::CircleReshape *>(node))
+      {
+        node = dynamic_cast<luci::CircleNode *>(reshape->tensor());
+      }
+      if (auto pack = dynamic_cast<luci::CirclePack *>(node))
+      {
+        if (pack->values_count() == 1 && pack->rank() == 3)
+        {
+          node = dynamic_cast<luci::CircleNode *>(pack->values(0));
         }
       }
-      else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
+
+      // Fuse FullyConnected to BCQFullyConnected
+      if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
       {
-        auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
-        if (weights != nullptr && has_BCQ_info(weights))
+        if (auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights()))
         {
           auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
 
           bcq_fc->op_version(1);
-          bcq_fc->weights_scales(get_alpha(weights));
-          bcq_fc->weights_binary(get_packed_binary_code(weights));
+          bcq_fc->weights_scales(_alpha[prefix]);
+          bcq_fc->weights_binary(_packed_binary_code[prefix]);
           bcq_fc->bias(fully_connected->bias());
-          bcq_fc->weights_clusters(packed_clusters(weights));
+          bcq_fc->weights_clusters(packed_clusters(g, prefix));
           bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
 
           loco::Node *bcq_input = fully_connected->input();
-          int32_t batch_rank = 0;
 
           // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2
           const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
@@ -200,27 +176,18 @@ public:
             reshape->shape(new_shape);
 
             bcq_input = reshape;
-            batch_rank = original_input->rank() - 2;
           }
 
           // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected
-          if (do_w_x(weights))
+          if (_do_w_x[prefix]->at<loco::DataType::BOOL>(0))
           {
-            const auto binary_hidden_size =
-                loco::must_cast<luci::CircleNode *>(fully_connected->input())
-                    ->dim(batch_rank)
-                    .value();
-            bcq_fc->weights_hidden_size(binary_hidden_size);
+            bcq_fc->weights_hidden_size(weights->dim(0).value());
             bcq_fc->input(bcq_input);
             loco::replace(fully_connected).with(bcq_fc);
           }
           else
           {
-            const auto binary_hidden_size =
-                loco::must_cast<luci::CircleNode *>(fully_connected->input())
-                    ->dim(1 + batch_rank)
-                    .value();
-            bcq_fc->weights_hidden_size(binary_hidden_size);
+            bcq_fc->weights_hidden_size(weights->dim(1).value());
 
             auto perm = g->nodes()->create<luci::CircleConst>();
             perm->dtype(loco::DataType::S32);
@@ -244,159 +211,183 @@ public:
             loco::replace(fully_connected).with(output_transpose);
           }
 
-          changed = true;
+          return true;
+        }
+        else
+        {
+          // TODO Is there any case that input() is constant, instead of weights()?
         }
       }
     }
 
-    if (changed)
-      clear_BCQ_nodes();
-
-    return changed;
+    return false;
   }
 
 private:
-  void add_BCQ_info_node(luci::CircleConst *node)
+  enum MetadataType
   {
-    const auto node_name = node->name();
-    const auto prefix = node_name_prefix(node_name);
-
-    // If bcqinfo_* nodes are held by Reshape operation,
-    // shape of bcqinfo_* nodes are copied to `shape` input of Reshape operation.
-    // Then the name becomes bcqinfo_*_copy_shape.
-    // We should prevent this node not to added to bcq information.
-    if (node_name.find("_copy_shape") != std::string::npos)
+    DO_W_X,
+    ALPHA,
+    BINARY_CODE,
+    NUM_OF_CLUSTERS,
+    SIZE_OF_CLUSTERS,
+    QBITS_OF_CLUSTERS,
+    FUSABLE_OP,
+    DEQUANT_WEIGHT,
+  };
+
+  void add_BCQ_info_node(int32_t prefix, MetadataType metadata_type, luci::CircleNode *node)
+  {
+    if (metadata_type == MetadataType::FUSABLE_OP)
+    {
+      _fusable_op[prefix] = node;
       return;
+    }
 
-    if (node_name.find("bcqinfo_do_w_x") != std::string::npos)
-      _do_w_x[prefix] = node;
-    else if (node_name.find("bcqinfo_alpha") != std::string::npos)
-      _alpha[prefix] = node;
-    else if (node_name.find("bcqinfo_packed_binary_code") != std::string::npos)
-      _packed_binary_code[prefix] = node;
-    else if (node_name.find("bcqinfo_number_of_clusters") != std::string::npos)
-      _number_of_clusters[prefix] = node;
-    else if (node_name.find("bcqinfo_size_of_clusters") != std::string::npos)
-      _size_of_clusters[prefix] = node;
-    else if (node_name.find("bcqinfo_qbits_of_clusters") != std::string::npos)
-      _qbits_of_clusters[prefix] = node;
-    else if (node_name.find("bcqinfo_dequant_weight") != std::string::npos)
-      _dequant_weight[prefix] = node;
-  }
+    luci::CircleConst *const_node;
 
-  bool has_BCQ_info(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    bool has_info = true;
-
-    has_info &= (_do_w_x.find(prefix) != _do_w_x.end());
-    has_info &= (_alpha.find(prefix) != _alpha.end());
-    has_info &= (_packed_binary_code.find(prefix) != _packed_binary_code.end());
-    has_info &= (_number_of_clusters.find(prefix) != _number_of_clusters.end());
-    has_info &= (_size_of_clusters.find(prefix) != _size_of_clusters.end());
-    has_info &= (_qbits_of_clusters.find(prefix) != _qbits_of_clusters.end());
-    // bcqinfo_dequant_weight is just for validation, so not always exists.
-
-    return has_info;
+    // Converter in TensorFlow v1.x sometimes generate Reshape op
+    if (auto reshape = dynamic_cast<luci::CircleReshape *>(node))
+      const_node = loco::must_cast<luci::CircleConst *>(reshape->tensor());
+    else
+      const_node = loco::must_cast<luci::CircleConst *>(node);
+
+    if (metadata_type == MetadataType::DO_W_X)
+      _do_w_x[prefix] = const_node;
+    else if (metadata_type == MetadataType::ALPHA)
+      _alpha[prefix] = const_node;
+    else if (metadata_type == MetadataType::BINARY_CODE)
+      _packed_binary_code[prefix] = const_node;
+    else if (metadata_type == MetadataType::NUM_OF_CLUSTERS)
+      _number_of_clusters[prefix] = const_node;
+    else if (metadata_type == MetadataType::SIZE_OF_CLUSTERS)
+      _size_of_clusters[prefix] = const_node;
+    else if (metadata_type == MetadataType::QBITS_OF_CLUSTERS)
+      _qbits_of_clusters[prefix] = const_node;
+    else
+      _dequant_weight[prefix] = const_node;
   }
 
-  /**
-   * @brief Exclude BCQ information nodes which are used for fusing BCQ operations
-   *        from graph output by using CircleOutputExclude
-   */
-  void clear_BCQ_nodes()
+  bool is_bcqinfo_valid()
   {
-    auto clear_nodes = [](std::map<std::string, luci::CircleConst *> &nodes) {
-      for (auto &n : nodes)
+    LOGGER(l);
+
+    for (auto n : _do_w_x)
+    {
+      // do_w_x should be BOOL type
+      if (n.second->dtype() != loco::DataType::BOOL)
       {
-        auto node = n.second;
+        WARN(l) << "FuseBCQPass : do_w_x has wrong type" << std::endl;
+        return false;
+      }
+    }
 
-        for (auto s : loco::succs(node))
-        {
-          if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
-          {
-            outnode->from(createNoOp(node));
-          }
-          else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
-          {
-            for (auto o : loco::succs(reshape_node))
-            {
-              auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
-              circle_output->from(createNoOp(reshape_node));
-            }
-          }
-        }
+    for (auto n : _alpha)
+    {
+      // alpha should be FLOAT32 type
+      if (n.second->dtype() != loco::DataType::FLOAT32)
+      {
+        WARN(l) << "FuseBCQPass : alpha has wrong type" << std::endl;
+        return false;
       }
-    };
-
-    clear_nodes(_do_w_x);
-    clear_nodes(_alpha);
-    clear_nodes(_packed_binary_code);
-    clear_nodes(_number_of_clusters);
-    clear_nodes(_size_of_clusters);
-    clear_nodes(_qbits_of_clusters);
-    clear_nodes(_dequant_weight);
-  }
+    }
 
-  bool is_bcqinfo_valid()
-  {
-    // do_w_x should be int32 or bool type
-    for (auto n : _do_w_x)
+    for (auto n : _packed_binary_code)
+    {
+      // packed_binary_code should be INT32 type
+      if (n.second->dtype() != loco::DataType::S32)
+      {
+        WARN(l) << "FuseBCQPass : packed_binary_code has wrong type" << std::endl;
+        return false;
+      }
+    }
+
+    for (auto n : _number_of_clusters)
     {
-      if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32)
+      // number_of_clusters should be INT32 type
+      if (n.second->dtype() != loco::DataType::S32)
+      {
+        WARN(l) << "FuseBCQPass : number_of_clusters has wrong type" << std::endl;
         return false;
+      }
     }
 
+    for (auto n : _size_of_clusters)
+    {
+      // size_of_clusters should be INT32 type
+      if (n.second->dtype() != loco::DataType::S32)
+      {
+        WARN(l) << "FuseBCQPass : size_of_clusters has wrong type" << std::endl;
+        return false;
+      }
+    }
+
+    for (auto n : _qbits_of_clusters)
+    {
+      // qbits_of_clusters should be INT32 type
+      if (n.second->dtype() != loco::DataType::S32)
+      {
+        WARN(l) << "FuseBCQPass : qbits_of_clusters has wrong type" << std::endl;
+        return false;
+      }
+    }
+
+    // As dequant_weight is not used for fusing, skip validation.
+
     return true;
   }
 
-private:
-  bool do_w_x(luci::CircleConst *node)
+  bool is_valid_prefix(int32_t prefix)
   {
-    const auto prefix = node_name_prefix(node->name());
+    LOGGER(l);
 
-    if (_do_w_x[prefix]->dtype() == loco::DataType::S32)
-      return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1;
-    else
-      return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
-  }
+    if (_do_w_x.find(prefix) == _do_w_x.end())
+    {
+      WARN(l) << "do_w_x is not found" << std::endl;
+      return false;
+    }
 
-  luci::CircleConst *get_alpha(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    return _alpha[prefix];
-  }
+    if (_alpha.find(prefix) == _alpha.end())
+    {
+      WARN(l) << "alpha is not found" << std::endl;
+      return false;
+    }
 
-  luci::CircleConst *get_packed_binary_code(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    return _packed_binary_code[prefix];
-  }
+    if (_packed_binary_code.find(prefix) == _packed_binary_code.end())
+    {
+      WARN(l) << "packed_binary_code is not found" << std::endl;
+      return false;
+    }
 
-  luci::CircleConst *get_number_of_clusters(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    return _number_of_clusters[prefix];
-  }
+    if (_number_of_clusters.find(prefix) == _number_of_clusters.end())
+    {
+      WARN(l) << "number_of_clusters is not found" << std::endl;
+      return false;
+    }
 
-  luci::CircleConst *get_size_of_clusters(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    return _size_of_clusters[prefix];
-  }
+    if (_size_of_clusters.find(prefix) == _size_of_clusters.end())
+    {
+      WARN(l) << "size_of_clusters is not found" << std::endl;
+      return false;
+    }
 
-  luci::CircleConst *get_qbits_of_clusters(luci::CircleConst *node)
-  {
-    const auto prefix = node_name_prefix(node->name());
-    return _qbits_of_clusters[prefix];
+    if (_qbits_of_clusters.find(prefix) == _qbits_of_clusters.end())
+    {
+      WARN(l) << "qbits_of_clusters is not found" << std::endl;
+      return false;
+    }
+
+    // As dequant_weight is not used for fusing, skip validation.
+
+    return true;
   }
 
-  luci::CircleConst *packed_clusters(luci::CircleConst *node)
+private:
+  luci::CircleConst *packed_clusters(loco::Graph *graph, int32_t prefix)
   {
-    auto graph = node->graph();
-    auto qbits_of_clusters = get_qbits_of_clusters(node);
-    auto size_of_clusters = get_size_of_clusters(node);
-    const auto number_of_clusters = get_number_of_clusters(node)->at<loco::DataType::S32>(0);
+    auto qbits_of_clusters = _qbits_of_clusters[prefix];
+    auto size_of_clusters = _size_of_clusters[prefix];
+    const auto number_of_clusters = _number_of_clusters[prefix]->at<loco::DataType::S32>(0);
 
     auto packed_clusters = graph->nodes()->create<luci::CircleConst>();
     packed_clusters->dtype(loco::DataType::S32);
@@ -418,13 +409,18 @@ private:
   }
 
 private:
-  std::map<std::string, luci::CircleConst *> _do_w_x;
-  std::map<std::string, luci::CircleConst *> _alpha;
-  std::map<std::string, luci::CircleConst *> _packed_binary_code;
-  std::map<std::string, luci::CircleConst *> _number_of_clusters;
-  std::map<std::string, luci::CircleConst *> _size_of_clusters;
-  std::map<std::string, luci::CircleConst *> _qbits_of_clusters;
-  std::map<std::string, luci::CircleConst *> _dequant_weight;
+  std::map<int32_t, luci::CircleConst *> _do_w_x;
+  std::map<int32_t, luci::CircleConst *> _alpha;
+  std::map<int32_t, luci::CircleConst *> _packed_binary_code;
+  std::map<int32_t, luci::CircleConst *> _number_of_clusters;
+  std::map<int32_t, luci::CircleConst *> _size_of_clusters;
+  std::map<int32_t, luci::CircleConst *> _qbits_of_clusters;
+  std::map<int32_t, luci::CircleConst *> _dequant_weight;
+  std::map<int32_t, luci::CircleNode *> _fusable_op;
+
+private:
+  int32_t _original_output_cnt = 0;
+  int32_t _bundle_cnt = 0;
 };
 
 } // namespace
@@ -436,38 +432,72 @@ bool FuseBCQPass::run(loco::Graph *g)
 {
   bool changed = false;
 
-  // Find BCQ version information and check validity.
-  luci::CircleConst *version_node = nullptr;
-  for (auto node : loco::all_nodes(g))
+  const int32_t start_magicnum = -2e9 + 27;
+  const int32_t end_magicnum = 2e9 - 27;
+
+  luci::CircleConst *metadata_node = nullptr;
+  for (auto node : loco::output_nodes(g))
   {
-    if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
+    auto output_node = loco::must_cast<luci::CircleOutput *>(node);
+
+    // Metadata node should be first output
+    if (output_node->index() != 0)
+      continue;
+
+    // Metadata should be constant and dtype should be S32
+    auto const_node = dynamic_cast<luci::CircleConst *>(output_node->from());
+    if (const_node == nullptr || const_node->dtype() != loco::DataType::S32)
+      continue;
+
+    // Metadata has at least four elements
+    const auto element_cnt = const_node->size<loco::DataType::S32>();
+    if (element_cnt < 4)
+      continue;
+
+    // Metadata has magic numbers at first and at last
+    const auto start_value = const_node->at<loco::DataType::S32>(0);
+    const auto end_value = const_node->at<loco::DataType::S32>(element_cnt - 1);
+    if (start_value == start_magicnum && end_value == end_magicnum)
     {
-      if (circle_const->name().find("/bcqinfo_version") != std::string::npos)
-      {
-        // There should be only one bcqinfo_version in the model
-        if (version_node != nullptr)
-        {
-          assert(false && "Multiple version information found");
-          return false;
-        }
-
-        version_node = circle_const;
-      }
+      metadata_node = const_node;
+      break;
     }
   }
 
-  // If version node is not found, regard it as version 1.
-  int32_t bcq_version = (version_node != nullptr) ? version_node->at<loco::DataType::S32>(0) : 1;
+  if (metadata_node != nullptr)
+  {
+    const auto bcq_version = metadata_node->at<loco::DataType::S32>(1);
+    const auto original_output_cnt = metadata_node->at<loco::DataType::S32>(2);
 
-  if (bcq_version == 1)
-    changed = BCQFuser<1>().fuseBCQ(g);
-  else
-    assert(false && "Not supported BCQ version");
+    if (bcq_version == 1)
+    {
+      const auto bundle_cnt = metadata_node->at<loco::DataType::S32>(3);
 
-  if (changed && version_node != nullptr)
-  {
-    // If BCQ is applied and version node was found, remove the node.
-    loco::replace(version_node).with(createNoOp(version_node));
+      BCQFuser<1> fuser{original_output_cnt, bundle_cnt};
+      if (fuser.fuseBCQ(g))
+        changed = true;
+    }
+    else
+    {
+      LOGGER(l);
+      WARN(l) << "Not supported BCQ version is found." << std::endl;
+    }
+
+    // Remove all of BCQ information nodes iff there is no change
+    if (changed == false)
+    {
+      for (auto node : loco::output_nodes(g))
+      {
+        auto output_node = loco::must_cast<luci::CircleOutput *>(node);
+        if (output_node->index() == 0 || (int)output_node->index() > original_output_cnt)
+        {
+          auto noOp = g->nodes()->create<luci::CircleOutputExclude>();
+          noOp->dtype(loco::DataType::FLOAT32); // TODO Remove this setting
+          output_node->from(noOp);
+          changed = true;
+        }
+      }
+    }
   }
 
   return changed;
diff --git a/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp b/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp
index e39455b1a..95ccd8176 100644
--- a/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp
+++ b/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp
@@ -77,11 +77,11 @@ bool fused_batch_norm_with_tconv(luci::CircleTransposeConv *tconv)
   // scale dim(0) == tconv filter channel dim
   if (filter->rank() != 4)
     return false;
-  auto filter_channel_dim = filter->dim(3).value();
+  auto filter_out_dim = filter->dim(0).value();
   if (scale->rank() != 1)
     return false;
   auto scale_dim = scale->dim(0).value();
-  if (filter_channel_dim != scale_dim)
+  if (filter_out_dim != scale_dim)
     return false;
 
   // get shift of batchnorm
@@ -93,23 +93,23 @@ bool fused_batch_norm_with_tconv(luci::CircleTransposeConv *tconv)
   if (shift->rank() != 1)
     return false;
   auto shift_dim = shift->dim(0).value();
-  if (filter_channel_dim != shift_dim)
+  if (filter_out_dim != shift_dim)
     return false;
 
   // filter weight = filter weight * mul(scale) + add(shift)
-  uint32_t filter_batch_dim = filter->dim(0).value();
   uint32_t filter_height_dim = filter->dim(1).value();
   uint32_t filter_width_dim = filter->dim(2).value();
-  for (uint32_t c = 0; c < filter_channel_dim; c++)
+  uint32_t filter_in_dim = filter->dim(3).value();
+  for (uint32_t c = 0; c < filter_out_dim; c++)
   {
-    for (uint32_t n = 0; n < filter_batch_dim; n++)
+    for (uint32_t h = 0; h < filter_height_dim; h++)
     {
-      for (uint32_t h = 0; h < filter_height_dim; h++)
+      for (uint32_t w = 0; w < filter_width_dim; w++)
       {
-        for (uint32_t w = 0; w < filter_width_dim; w++)
+        for (uint32_t b = 0; b < filter_in_dim; b++)
         {
-          uint32_t offset = n * filter_height_dim * filter_width_dim * filter_channel_dim +
-                            h * filter_width_dim * filter_channel_dim + w * filter_channel_dim + c;
+          uint32_t offset = c * filter_height_dim * filter_width_dim * filter_in_dim +
+                            h * filter_width_dim * filter_in_dim + w * filter_in_dim + b;
           filter->at<loco::DataType::FLOAT32>(offset) *= scale->at<loco::DataType::FLOAT32>(c);
         }
       }
diff --git a/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp b/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp
new file mode 100644
index 000000000..0f8d562e9
--- /dev/null
+++ b/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizationUtils.h"
+
+#include <luci/IR/CircleQuantParam.h>
+
+#include <math.h>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+void addQuantParam(luci::CircleNode &node, const std::vector<float> &scale,
+                   const std::vector<int64_t> &zp)
+{
+  assert(node.quantparam() == nullptr);
+
+  auto quantparam = std::make_unique<luci::CircleQuantParam>();
+  quantparam->scale = scale;
+  quantparam->zerop = zp;
+  node.quantparam(std::move(quantparam));
+}
+
+int32_t quantize(float f, luci::CircleQuantParam *qparam)
+{
+  float scale = qparam->scale[0];
+  int64_t zp = qparam->zerop[0];
+
+  return std::round(f / scale) + zp;
+}
+
+class SimpleConcatGraph
+{
+public:
+  SimpleConcatGraph(loco::DataType quant_type)
+  {
+    concat_node.dtype(quant_type);
+    concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
+    input_1.dtype(quant_type);
+    input_2.dtype(quant_type);
+
+    concat_node.values(0, &input_1);
+    concat_node.values(1, &input_2);
+
+    if (quant_type == loco::DataType::U8)
+    {
+      addQuantParam(concat_node, {3.14}, {77});
+      addQuantParam(input_1, {1.0}, {1});
+      addQuantParam(input_2, {2.0}, {2});
+    }
+    else if (quant_type == loco::DataType::S16)
+    {
+      addQuantParam(concat_node, {3.14}, {0});
+      addQuantParam(input_1, {1.0}, {0});
+      addQuantParam(input_2, {2.0}, {0});
+    }
+    else
+    {
+      throw std::runtime_error("Unsupported quantization type");
+    }
+  }
+
+  ~SimpleConcatGraph()
+  {
+    concat_node.values(0, nullptr);
+    concat_node.values(1, nullptr);
+  }
+
+public:
+  luci::CircleConcatenation concat_node{2};
+  luci::CircleConv2D input_1;
+  luci::CircleConv2D input_2;
+};
+
+class SubsequentConcatGraph
+{
+public:
+  SubsequentConcatGraph(loco::DataType quant_type)
+  {
+    concat_node.dtype(quant_type);
+    concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
+    input_1.dtype(quant_type);
+    input_2.dtype(quant_type);
+
+    concat_node.values(0, &input_1);
+    concat_node.values(1, &input_2);
+
+    if (quant_type == loco::DataType::U8)
+    {
+      addQuantParam(concat_node, {3.14}, {77});
+      addQuantParam(input_1, {1.0}, {1});
+      addQuantParam(input_2, {2.0}, {2});
+    }
+    else if (quant_type == loco::DataType::S16)
+    {
+      addQuantParam(concat_node, {3.14}, {0});
+      addQuantParam(input_1, {1.0}, {0});
+      addQuantParam(input_2, {2.0}, {0});
+    }
+    else
+    {
+      throw std::runtime_error("Unsupported quantization type");
+    }
+  }
+
+  ~SubsequentConcatGraph()
+  {
+    concat_node.values(0, nullptr);
+    concat_node.values(1, nullptr);
+  }
+
+public:
+  luci::CircleConcatenation concat_node{2};
+  luci::CircleConcatenation input_1{2};
+  luci::CircleConv2D input_2;
+};
+
+class ConstInputConcatGraph
+{
+public:
+  ConstInputConcatGraph(loco::DataType quant_type)
+  {
+    concat_node.dtype(quant_type);
+    concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
+    input_1.dtype(loco::DataType::FLOAT32);
+    input_1.size<loco::DataType::FLOAT32>(5);
+    for (int i = 0; i < 5; i++)
+    {
+      // Set data {-2, -1, 0, 1, 2}
+      input_1.at<loco::DataType::FLOAT32>(i) = i - 2.0;
+    }
+
+    input_2.dtype(quant_type);
+
+    concat_node.values(0, &input_1);
+    concat_node.values(1, &input_2);
+
+    if (quant_type == loco::DataType::U8)
+    {
+      addQuantParam(concat_node, {0.1}, {10});
+      addQuantParam(input_2, {2.0}, {2});
+    }
+    else if (quant_type == loco::DataType::S16)
+    {
+      addQuantParam(concat_node, {0.1}, {0});
+      addQuantParam(input_2, {2.0}, {0});
+    }
+    else
+    {
+      throw std::runtime_error("Unsupported quantization type");
+    }
+  }
+
+  ~ConstInputConcatGraph()
+  {
+    concat_node.values(0, nullptr);
+    concat_node.values(1, nullptr);
+  }
+
+public:
+  luci::CircleConcatenation concat_node{2};
+  luci::CircleConst input_1;
+  luci::CircleConv2D input_2;
+};
+
+} // namespace
+
+TEST(PropagateConcatenationQparam, propagate_concat_quantparam_u8)
+{
+  // Check cases where qparam of concat_node is propagated
+  // (1) normal case: qparam is propagated to input_1 and input_2
+  // (2) input used by other Op: input_1 is an input of input_2. qparam is propagated only to
+  // input_2
+  // (3) subsequent concat: input_1 is concat. qparam is propagated only to input_2
+  // (4) const input: input_1 is const. constant values are quantized
+
+  // normal case: qparam of concat_node is propagated to input_1 and input_2
+  SimpleConcatGraph g(loco::DataType::U8);
+  luci::propagate_concat_quantparam(&g.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(3.14, g.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(77, g.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(77, g.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(77, g.input_2.quantparam()->zerop[0]);
+
+  // input_1 is an input of input_2. qparam is propagated only to input_2
+  SimpleConcatGraph g2(loco::DataType::U8);
+  g2.input_2.input(&g2.input_1);
+  luci::propagate_concat_quantparam(&g2.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(3.14, g2.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(77, g2.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, g2.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(1, g2.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g2.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(77, g2.input_2.quantparam()->zerop[0]);
+
+  // input_1 is concat. qparam is propagated only to input_2
+  SubsequentConcatGraph sg(loco::DataType::U8);
+  luci::propagate_concat_quantparam(&sg.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(3.14, sg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(77, sg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, sg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(1, sg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, sg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(77, sg.input_2.quantparam()->zerop[0]);
+
+  // input_1 is const. const values are quantized with the qparam of concat
+  ConstInputConcatGraph cg(loco::DataType::U8);
+  luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(10, cg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.1, cg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(10, cg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.1, cg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(10, cg.input_2.quantparam()->zerop[0]);
+  EXPECT_EQ(loco::DataType::U8, cg.input_1.dtype());
+  EXPECT_EQ(0, cg.input_1.at<loco::DataType::U8>(0));
+  EXPECT_EQ(0, cg.input_1.at<loco::DataType::U8>(1));
+  EXPECT_EQ(10, cg.input_1.at<loco::DataType::U8>(2));
+  EXPECT_EQ(20, cg.input_1.at<loco::DataType::U8>(3));
+  EXPECT_EQ(30, cg.input_1.at<loco::DataType::U8>(4));
+}
+
+TEST(PropagateConcatenationQparam, propagate_concat_quantparam_u8_NEG)
+{
+  // Check negative cases where qparam is not propagated
+  // (1) concat has fused activation function
+  // (2) concat has fused activation function and input is const
+
+  SimpleConcatGraph g(loco::DataType::U8);
+
+  // concat has fused activation function
+  g.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
+  luci::propagate_concat_quantparam(&g.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(3.14, g.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(77, g.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, g.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(1, g.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(2.0, g.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(2, g.input_2.quantparam()->zerop[0]);
+  g.concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
+
+  // concat has fused activation function and input_1 is const.
+  // const values are quantized using its min/max
+  ConstInputConcatGraph cg(loco::DataType::U8);
+  cg.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
+  luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::U8);
+  EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(10, cg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.015686275, cg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(128, cg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(2.0, cg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(2, cg.input_2.quantparam()->zerop[0]);
+  EXPECT_EQ(loco::DataType::U8, cg.input_1.dtype());
+  EXPECT_EQ(quantize(-2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(0));
+  EXPECT_EQ(quantize(-1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(1));
+  EXPECT_EQ(quantize(0, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(2));
+  EXPECT_EQ(quantize(1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(3));
+  EXPECT_EQ(quantize(2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(4));
+}
+
+TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16)
+{
+  // Check cases where qparam of concat_node is propagated
+  // (1) normal case: qparam is propagated to input_1 and input_2
+  // (2) input used by other Op: input_1 is an input of input_2. qparam is propagated only to
+  // input_2
+  // (3) subsequent concat: input_1 is concat. qparam is propagated only to input_2
+  // (4) const input: input_1 is const. constant values are quantized
+
+  // normal case: qparam of concat_node is propagated to input_1 and input_2
+  SimpleConcatGraph g(loco::DataType::S16);
+  luci::propagate_concat_quantparam(&g.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(3.14, g.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.input_2.quantparam()->zerop[0]);
+
+  // input_1 is an input of input_2. qparam is propagated only to input_2
+  SimpleConcatGraph g2(loco::DataType::S16);
+  g2.input_2.input(&g2.input_1);
+  luci::propagate_concat_quantparam(&g2.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(3.14, g2.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, g2.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, g2.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, g2.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, g2.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, g2.input_2.quantparam()->zerop[0]);
+
+  // input_1 is concat. qparam is propagated only to input_2
+  SubsequentConcatGraph sg(loco::DataType::S16);
+  luci::propagate_concat_quantparam(&sg.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(3.14, sg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, sg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, sg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, sg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(3.14, sg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, sg.input_2.quantparam()->zerop[0]);
+
+  // input_1 is const. const values are quantized with the qparam of concat
+  ConstInputConcatGraph cg(loco::DataType::S16);
+  luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.1, cg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.1, cg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.input_2.quantparam()->zerop[0]);
+  EXPECT_EQ(loco::DataType::S16, cg.input_1.dtype());
+  EXPECT_EQ(-20, cg.input_1.at<loco::DataType::S16>(0));
+  EXPECT_EQ(-10, cg.input_1.at<loco::DataType::S16>(1));
+  EXPECT_EQ(0, cg.input_1.at<loco::DataType::S16>(2));
+  EXPECT_EQ(10, cg.input_1.at<loco::DataType::S16>(3));
+  EXPECT_EQ(20, cg.input_1.at<loco::DataType::S16>(4));
+}
+
+TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16_NEG)
+{
+  // Check negative cases where qparam is not propagated
+  // (1) concat has fused activation function
+  // (2) concat has fused activation function and input is const
+
+  SimpleConcatGraph g(loco::DataType::S16);
+
+  // concat has fused activation function
+  g.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
+  luci::propagate_concat_quantparam(&g.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(3.14, g.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(1.0, g.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(2.0, g.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, g.input_2.quantparam()->zerop[0]);
+  g.concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
+
+  // concat has fused activation function and input_1 is const.
+  // const values are quantized using its min/max
+  ConstInputConcatGraph cg(loco::DataType::S16);
+  cg.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
+  luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::S16);
+  EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.concat_node.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(0.000061037, cg.input_1.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.input_1.quantparam()->zerop[0]);
+  EXPECT_FLOAT_EQ(2.0, cg.input_2.quantparam()->scale[0]);
+  EXPECT_EQ(0, cg.input_2.quantparam()->zerop[0]);
+  EXPECT_EQ(loco::DataType::S16, cg.input_1.dtype());
+  EXPECT_EQ(quantize(-2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(0));
+  EXPECT_EQ(quantize(-1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(1));
+  EXPECT_EQ(quantize(0, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(2));
+  EXPECT_EQ(quantize(1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(3));
+  EXPECT_EQ(quantize(2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(4));
+}
diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
index e18690605..9af52a4c4 100644
--- a/compiler/luci/pass/src/QuantizationUtils.cpp
+++ b/compiler/luci/pass/src/QuantizationUtils.cpp
@@ -31,6 +31,66 @@ uint8_t fp32_to_uint8_cast(float f)
   return static_cast<uint8_t>(f);
 }
 
+// Per-layer quantization of weights (const tensor) using given min/max values
+void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
+                                             float &scaling_factor, int64_t &zp, float &nudged_min,
+                                             float &nudged_max)
+{
+  const int32_t kMinScale = 0;
+  const int32_t kMaxScale = 255;
+
+  uint32_t size = node->size<loco::DataType::FLOAT32>();
+  compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max);
+  const float scaling_factor_inv = 1.0 / scaling_factor;
+  std::vector<int32_t> quantized_values(size);
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    // clipping
+    auto data = node->at<loco::DataType::FLOAT32>(i);
+    data = data < nudged_min ? nudged_min : data;
+    data = data > nudged_max ? nudged_max : data;
+    quantized_values[i] =
+        static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv));
+  }
+
+  node->dtype(loco::DataType::U8);      // change the type of tensor
+  node->size<loco::DataType::U8>(size); // resize tensor
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+  }
+}
+
+// Per-layer quantization of weights (const tensor) using given min/max values
+void symmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
+                                            float &scaling_factor, int64_t &zp, float &nudged_min,
+                                            float &nudged_max)
+{
+  const int32_t kMaxScale = std::numeric_limits<int16_t>::max();
+  const int32_t kMinScale = -kMaxScale;
+
+  uint32_t size = node->size<loco::DataType::FLOAT32>();
+  compute_sym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max);
+  const float scaling_factor_inv = 1.0 / scaling_factor;
+  std::vector<int32_t> quantized_values(size);
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    // clipping
+    auto data = node->at<loco::DataType::FLOAT32>(i);
+    data = data < nudged_min ? nudged_min : data;
+    data = data > nudged_max ? nudged_max : data;
+    quantized_values[i] = static_cast<int32_t>(std::round(data * scaling_factor_inv));
+  }
+
+  node->dtype(loco::DataType::S16);      // change the type of tensor
+  node->size<loco::DataType::S16>(size); // resize tensor
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    node->at<loco::DataType::S16>(i) =
+        std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+  }
+}
+
 void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp,
                           float &nudged_min, float &nudged_max)
 {
diff --git a/compiler/luci/pass/src/QuantizationUtils.h b/compiler/luci/pass/src/QuantizationUtils.h
index ec0e86df8..f766bd66d 100644
--- a/compiler/luci/pass/src/QuantizationUtils.h
+++ b/compiler/luci/pass/src/QuantizationUtils.h
@@ -29,10 +29,20 @@ void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &
 void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp,
                            float &nudged_min, float &nudged_max);
 
+void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
+                                             float &scaling_factor, int64_t &zp, float &nudged_min,
+                                             float &nudged_max);
+
+void symmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
+                                            float &scaling_factor, int64_t &zp, float &nudged_min,
+                                            float &nudged_max);
+
 bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index);
 
 uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices);
 
+void propagate_concat_quantparam(luci::CircleConcatenation *concat, loco::DataType quant_type);
+
 } // namespace luci
 
 #endif // __LUCI_QUANTIZATION_UTILS_H__
diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
index c492234c7..e9925c7ff 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
@@ -284,36 +284,6 @@ void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scal
   }
 }
 
-void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
-                                             float &scaling_factor, int64_t &zp, float &nudged_min,
-                                             float &nudged_max)
-{
-
-  const int32_t kMinScale = 0;
-  const int32_t kMaxScale = 255;
-
-  uint32_t size = node->size<loco::DataType::FLOAT32>();
-  compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max);
-  const float scaling_factor_inv = 1.0 / scaling_factor;
-  std::vector<int32_t> quantized_values(size);
-  for (uint32_t i = 0; i < size; ++i)
-  {
-    // clipping
-    auto data = node->at<loco::DataType::FLOAT32>(i);
-    data = data < nudged_min ? nudged_min : data;
-    data = data > nudged_max ? nudged_max : data;
-    quantized_values[i] =
-        static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv));
-  }
-
-  node->dtype(loco::DataType::U8);      // change the type of tensor
-  node->size<loco::DataType::U8>(size); // resize tensor
-  for (uint32_t i = 0; i < size; ++i)
-  {
-    node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
-  }
-}
-
 void asymmetric_wdequant_with_minmax_per_layer(CircleConst *node, float scaling_factor,
                                                float nudged_min)
 {
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index 60c1cdd72..564e814f9 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -32,7 +32,99 @@ namespace luci
 namespace
 {
 
-// Check if the node is the bias of Conv2D, DepthwiseConv2D, or FullyConnected layer
+void overwrite_quantparam(luci::CircleConcatenation *concat, luci::CircleNode *target)
+{
+  auto concat_qparam = concat->quantparam();
+  if (concat_qparam == nullptr)
+    throw std::runtime_error("quantparam of concat is not found during overwrite");
+
+  auto target_qparam = target->quantparam();
+  if (target_qparam == nullptr)
+  {
+    auto quantparam = std::make_unique<CircleQuantParam>();
+    target->quantparam(std::move(quantparam));
+    target_qparam = target->quantparam();
+  }
+  target_qparam->min = concat_qparam->min;
+  target_qparam->max = concat_qparam->max;
+  target_qparam->scale = concat_qparam->scale;
+  target_qparam->zerop = concat_qparam->zerop;
+  target_qparam->quantized_dimension = concat_qparam->quantized_dimension;
+}
+
+void quant_const_values(luci::CircleConst *const_node, float scaling_factor, float zerop,
+                        loco::DataType quant_type)
+{
+  uint32_t size = const_node->size<loco::DataType::FLOAT32>();
+
+  const float scaling_factor_inv = 1.0 / scaling_factor;
+  std::vector<int32_t> quantized_values(size);
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    auto data = const_node->at<loco::DataType::FLOAT32>(i);
+    quantized_values[i] = static_cast<int32_t>(std::round(data * scaling_factor_inv) + zerop);
+  }
+
+  switch (quant_type)
+  {
+    case loco::DataType::U8:
+      const_node->dtype(loco::DataType::U8);      // change the type of tensor
+      const_node->size<loco::DataType::U8>(size); // resize tensor
+      for (uint32_t i = 0; i < size; ++i)
+        const_node->at<loco::DataType::U8>(i) = std::min(255, std::max(0, quantized_values[i]));
+      break;
+    case loco::DataType::S16:
+      assert(zerop == 0);
+      const_node->dtype(loco::DataType::S16);      // change the type of tensor
+      const_node->size<loco::DataType::S16>(size); // resize tensor
+      for (uint32_t i = 0; i < size; ++i)
+        const_node->at<loco::DataType::S16>(i) =
+            std::min(32767, std::max(-32767, quantized_values[i]));
+      break;
+    default:
+      throw std::runtime_error("Unsupported data type");
+  }
+}
+
+void quant_const(CircleConst *node, loco::DataType quant_type)
+{
+  assert(node->dtype() == loco::DataType::FLOAT32);
+
+  float min = std::numeric_limits<float>::max();
+  float max = std::numeric_limits<float>::lowest();
+  for (uint32_t i = 0; i < node->size<loco::DataType::FLOAT32>(); i++)
+  {
+    auto data = node->at<loco::DataType::FLOAT32>(i);
+    min = data < min ? data : min;
+    max = data > max ? data : max;
+  }
+
+  float scaling_factor{0.0};
+  int64_t zp{0};
+  float nudged_min{0.0};
+  float nudged_max{0.0};
+
+  switch (quant_type)
+  {
+    case loco::DataType::U8:
+      asymmetric_wquant_with_minmax_per_layer(node, min, max, scaling_factor, zp, nudged_min,
+                                              nudged_max);
+      break;
+    case loco::DataType::S16:
+      symmetric_wquant_with_minmax_per_layer(node, min, max, scaling_factor, zp, nudged_min,
+                                             nudged_max);
+      break;
+    default:
+      throw std::runtime_error("Unsupported data type");
+  }
+
+  auto quantparam = std::make_unique<CircleQuantParam>();
+  quantparam->scale.push_back(scaling_factor);
+  quantparam->zerop.push_back(zp);
+  node->quantparam(std::move(quantparam));
+}
+
+// Check if the node is the bias of Conv2D, DepthwiseConv2D, FullyConnected, or TransposeConv layer
 // If true, return <input, weight> pair of the successor node (used to quantize bias)
 // If flase, return <nullptr, nullptr>
 std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node)
@@ -68,6 +160,13 @@ std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node)
       assert(fc->weights() != nullptr);
       return std::make_pair(fc->input(), fc->weights());
     }
+    auto tconv = dynamic_cast<CircleTransposeConv *>(out);
+    if (tconv != nullptr && tconv->bias() == circle_const)
+    {
+      assert(tconv->outBackprop() != nullptr);
+      assert(tconv->filter() != nullptr);
+      return std::make_pair(tconv->outBackprop(), tconv->filter());
+    }
   }
   return std::make_pair(nullptr, nullptr);
 }
@@ -514,8 +613,171 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
   }
 };
 
+/**
+ * @brief Quantize const input tensors using min/max of const values
+ */
+void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type)
+{
+  auto opcode = node->opcode();
+  auto arity = node->arity();
+
+  loco::Node *input_node{nullptr};
+  luci::CircleConst *const_node{nullptr};
+
+  switch (opcode)
+  {
+    case luci::CircleOpcode::CONV_2D:
+    case luci::CircleOpcode::DEPTHWISE_CONV_2D:
+    case luci::CircleOpcode::FULLY_CONNECTED:
+    case luci::CircleOpcode::TRANSPOSE_CONV:
+      // Handled in QuantizeWeights and QuantizeBias
+      break;
+
+    case luci::CircleOpcode::CONCATENATION:
+      // Handled in propagate_concat_quantparam
+      break;
+
+    case luci::CircleOpcode::ARG_MAX:
+    case luci::CircleOpcode::ARG_MIN:
+    case luci::CircleOpcode::MEAN:
+    case luci::CircleOpcode::PAD:
+    case luci::CircleOpcode::REDUCE_ANY:
+    case luci::CircleOpcode::REDUCE_PROD:
+    case luci::CircleOpcode::REDUCE_MAX:
+    case luci::CircleOpcode::REDUCE_MIN:
+    case luci::CircleOpcode::RESHAPE:
+    case luci::CircleOpcode::SUM:
+      // The second input of these Ops should not be quantized
+      // Ex: axis, paddings
+      input_node = node->arg(0);
+      const_node = dynamic_cast<luci::CircleConst *>(input_node);
+      if (const_node != nullptr)
+        quant_const(const_node, output_type);
+      break;
+
+    case luci::CircleOpcode::ADD:
+    case luci::CircleOpcode::ADD_N:
+    case luci::CircleOpcode::DIV:
+    case luci::CircleOpcode::EQUAL:
+    case luci::CircleOpcode::GREATER:
+    case luci::CircleOpcode::GREATER_EQUAL:
+    case luci::CircleOpcode::LESS:
+    case luci::CircleOpcode::LESS_EQUAL:
+    case luci::CircleOpcode::MAXIMUM:
+    case luci::CircleOpcode::MINIMUM:
+    case luci::CircleOpcode::MUL:
+    case luci::CircleOpcode::NOT_EQUAL:
+    case luci::CircleOpcode::PRELU:
+    case luci::CircleOpcode::SUB:
+      // Quantize all const inputs using their values
+      for (uint32_t i = 0; i < arity; i++)
+      {
+        input_node = node->arg(i);
+        const_node = dynamic_cast<luci::CircleConst *>(input_node);
+        if (const_node != nullptr)
+          quant_const(const_node, output_type);
+      }
+      break;
+
+    default:
+      for (uint32_t i = 0; i < arity; i++)
+      {
+        input_node = node->arg(i);
+        const_node = dynamic_cast<luci::CircleConst *>(input_node);
+        if (const_node != nullptr)
+          throw std::runtime_error("Unsupported Op for const inputs");
+      }
+      break;
+  }
+}
+
 } // namespace
 
+/** BEFORE
+ *
+ *         [CircleNode]             [CircleConst]
+ *         (U8 qparam1)                 (FP32)
+ *                   \                    /
+ *                    \                  /
+ *                    [CircleConcatenation]
+ *                        (U8 qparam2)
+ *
+ *  AFTER
+ *         [CircleNode]             [CircleConst]
+ *         (U8 qparam2)             (U8 qparam2)
+ *                   \                    /
+ *                    \                  /
+ *                    [CircleConcatenation]
+ *                        (U8 qparam2)
+ */
+void propagate_concat_quantparam(luci::CircleConcatenation *concat, loco::DataType quant_type)
+{
+  assert(concat->quantparam() != nullptr);
+
+  const auto num_inputs = concat->numValues();
+
+  // Quantize const inputs using their values if concat has fused act function
+  if (concat->fusedActivationFunction() != luci::FusedActFunc::NONE)
+  {
+    for (uint32_t i = 0; i < num_inputs; i++)
+    {
+      auto node = concat->arg(i);
+      auto const_node = dynamic_cast<luci::CircleConst *>(node);
+      if (const_node != nullptr)
+        quant_const(const_node, quant_type);
+    }
+    return;
+  }
+
+  for (uint32_t i = 0; i < num_inputs; i++)
+  {
+    auto node = loco::must_cast<luci::CircleNode *>(concat->arg(i));
+
+    // Skip if this input is CONCAT Op
+    if (node->opcode() == luci::CircleOpcode::CONCATENATION)
+      continue;
+
+    // Skip if this input is used by other Ops
+    auto succs = loco::succs(node);
+    if (succs.size() != 1)
+    {
+      if (node->opcode() == luci::CircleOpcode::CIRCLECONST)
+      {
+        luci::CircleConst *const_node = loco::must_cast<luci::CircleConst *>(node);
+        quant_const(const_node, quant_type);
+      }
+      continue;
+    }
+
+    assert(succs.find(concat) != succs.end());
+
+    // Quantize constant values
+    if (node->opcode() == luci::CircleOpcode::CIRCLECONST)
+    {
+      luci::CircleConst *const_node = loco::must_cast<luci::CircleConst *>(node);
+      if (const_node->dtype() != loco::DataType::FLOAT32)
+        throw std::runtime_error("Unsupported data type for constant input of concatenation Op");
+
+      const auto concat_qparam = concat->quantparam();
+      if (concat_qparam == nullptr)
+        throw std::runtime_error("quantparam of concat is not found during propagation");
+
+      assert(concat_qparam->scale.size() == 1);
+      const auto scaling_factor = concat_qparam->scale[0];
+      const auto zerop = concat_qparam->zerop[0];
+
+      quant_const_values(const_node, scaling_factor, zerop, quant_type);
+    }
+    else
+    {
+      // Non-const input must have been quantized
+      assert(node->quantparam() != nullptr);
+    }
+
+    overwrite_quantparam(concat, node);
+  }
+}
+
 bool QuantizeWithMinMaxPass::run(loco::Graph *g)
 {
   LOGGER(l);
@@ -538,11 +800,37 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   }
 
   // Quantize bias
+  // (For int16 quantization, bias is not quantized)
+  if (_output_dtype == loco::DataType::U8)
+  {
+    for (auto node : loco::active_nodes(loco::output_nodes(g)))
+    {
+      QuantizeBias qb(_input_dtype, _output_dtype, _granularity);
+      auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+      circle_node->accept(&qb);
+    }
+  }
+
+  // Quantize const inputs other than weights and bias
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeBias qb(_input_dtype, _output_dtype, _granularity);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
-    circle_node->accept(&qb);
+    quantize_const_inputs(circle_node, _output_dtype);
+  }
+
+  // Propagate quantization parameters of concat Op
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto concat = dynamic_cast<luci::CircleConcatenation *>(node);
+    if (not concat)
+      continue;
+
+    // Propagate qparam of concat to its inputs if
+    // (1) concat is uint8-quantized
+    // (2) concat has no fused activation function
+    // (3) the input is not concatenation Op
+    // (4) the input is not produced to Ops other than concat
+    propagate_concat_quantparam(concat, _output_dtype);
   }
 
   // Update output dtype
diff --git a/compiler/luci/pass/src/RequantizePass.cpp b/compiler/luci/pass/src/RequantizePass.cpp
index 49fbf76ec..fe84e3bc3 100644
--- a/compiler/luci/pass/src/RequantizePass.cpp
+++ b/compiler/luci/pass/src/RequantizePass.cpp
@@ -56,7 +56,9 @@ bool is_bias(CircleConst *node)
     if (fc != nullptr && fc->bias() == node)
       return true;
 
-    // TODO: add TransposeConv when bias is supported in CircleTransposeConv
+    auto tconv = dynamic_cast<CircleTransposeConv *>(out);
+    if (tconv != nullptr && tconv->bias() == node)
+      return true;
   }
   return false;
 }
diff --git a/compiler/luci/pass/src/Sparsifier.cpp b/compiler/luci/pass/src/Sparsifier.cpp
new file mode 100644
index 000000000..2aa542f15
--- /dev/null
+++ b/compiler/luci/pass/src/Sparsifier.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Sparsifier.h"
+
+namespace luci
+{
+
+template <typename T>
+Sparsifier<T>::Sparsifier(const std::vector<int32_t> &shape,
+                          const std::vector<int32_t> &traversal_order,
+                          const std::vector<DimensionType> &format,
+                          const std::vector<int32_t> &block_size,
+                          const std::vector<int32_t> &block_map)
+    : _dense_shape(shape), _traversal_order(traversal_order), _block_size(block_size),
+      _block_map(block_map)
+{
+  _dense_size = 1;
+  int32_t block_dim = 0;
+  _blocked_shape.resize(shape.size());
+  _format.resize(shape.size() + block_map.size());
+  for (int32_t i = 0; i < static_cast<int32_t>(shape.size()); i++)
+  {
+    _format[i] = format[traversal_order[i]];
+    _dense_size *= shape[i];
+    if (block_dim < static_cast<int32_t>(block_map.size()) && block_map[block_dim] == i)
+    {
+      _blocked_shape[i] = shape[i] / block_size[block_dim];
+      block_dim++;
+    }
+    else
+    {
+      _blocked_shape[i] = shape[i];
+    }
+  }
+
+  // Only dense blocks are supported.
+  for (uint32_t i = 0; i < block_map.size(); i++)
+  {
+    _format[i + shape.size()] = DimensionType::DENSE;
+  }
+}
+
+template <typename T> void Sparsifier<T>::DenseToSparse(const T *src_data)
+{
+  int num_original_dims = _dense_shape.size();
+  int num_block_dims = _block_map.size();
+  int num_expanded_dims = num_original_dims + num_block_dims;
+  std::vector<int> expanded_shape(num_expanded_dims);
+  for (int i = 0; i < num_expanded_dims; i++)
+  {
+    if (i < num_original_dims)
+    {
+      expanded_shape[i] = _blocked_shape[i];
+    }
+    else
+    {
+      expanded_shape[i] = _block_size[i - num_original_dims];
+    }
+  }
+
+  std::vector<int> shape_offset(num_original_dims);
+  shape_offset[shape_offset.size() - 1] = 1;
+  for (int i = num_original_dims - 1; i > 0; --i)
+  {
+    shape_offset[i - 1] = shape_offset[i] * _dense_shape[i];
+  }
+
+  std::vector<int> expanded_shape_offset(num_expanded_dims);
+  for (int i = 0; i < num_original_dims; ++i)
+  {
+    expanded_shape_offset[i] = shape_offset[i];
+  }
+  for (int i = 0; i < num_block_dims; ++i)
+  {
+    int mapped_dim = _block_map[i];
+    expanded_shape_offset[num_original_dims + i] = shape_offset[mapped_dim];
+    expanded_shape_offset[mapped_dim] *= _block_size[i];
+  }
+
+  std::vector<int> dst_ordered_offset(num_expanded_dims);
+  for (int i = 0; i < num_expanded_dims; ++i)
+  {
+    dst_ordered_offset[i] = expanded_shape_offset[_traversal_order[i]];
+  }
+
+  std::vector<bool> dst_dim_has_nonzeroes(num_expanded_dims);
+  std::fill(dst_dim_has_nonzeroes.begin(), dst_dim_has_nonzeroes.end(), false);
+  std::vector<int> inner_compressed_dim(num_expanded_dims);
+  int most_recent_compressed_dim = -1;
+  std::vector<int> num_segments_of_next_compressed_dim(num_expanded_dims);
+  int segment_count = 1;
+  for (int i = num_expanded_dims - 1; i >= 0; --i)
+  {
+    inner_compressed_dim[i] = most_recent_compressed_dim;
+    if (_format[i] == DimensionType::SPARSE_CSR)
+    {
+      most_recent_compressed_dim = i;
+      num_segments_of_next_compressed_dim[i] = segment_count;
+      segment_count = 1;
+    }
+    else
+    {
+      num_segments_of_next_compressed_dim[i] = -1;
+      segment_count *= expanded_shape[_traversal_order[i]];
+    }
+  }
+
+  _dim_metadata.resize(num_expanded_dims * 2);
+  std::vector<int> dst_sparse_dims;
+  dst_sparse_dims.reserve(num_expanded_dims);
+  for (int i = 0; i < num_expanded_dims; ++i)
+  {
+    _dim_metadata[i * 2].clear();
+    _dim_metadata[i * 2 + 1].clear();
+    if (_format[i] == DimensionType::DENSE)
+    {
+      // If dimension is dense, just store the shape.
+      _dim_metadata[i * 2].push_back(expanded_shape[_traversal_order[i]]);
+    }
+    else
+    {
+      _dim_metadata[i * 2].push_back(0); // Segment array always begins with 0.
+      dst_sparse_dims.push_back(i);      // Add dimension to the sparse list.
+    }
+  }
+
+  // This algorithm assumes that the block size is small enough for all the
+  // elements to fit in cache, so the strided accesses from different traversal
+  // order and the write-first-erase-later strategy shouldn't be too slow
+  int dst_dim_idx = num_expanded_dims;
+  std::vector<int> coordinate(num_expanded_dims, 0);
+  int dense_tensor_idx = 0;
+  while (dst_dim_idx >= 0)
+  {
+    if (dst_dim_idx == num_expanded_dims)
+    {
+      // We have a complete coordinate. Add the element to the value array if it
+      // is not zero, or if the last dimension is dense.
+      if (!IsZero(src_data[dense_tensor_idx]))
+      {
+        _data.push_back(src_data[dense_tensor_idx]);
+        // Mark all sparse dimensions that their current indices have nonzeroes.
+        for (auto dst_dim : dst_sparse_dims)
+        {
+          if (!dst_dim_has_nonzeroes[dst_dim])
+          {
+            // Only add the index to the indices array if the current nonzero
+            // is the first nonzero of the block.
+            _dim_metadata[2 * dst_dim + 1].push_back(coordinate[dst_dim]);
+            dst_dim_has_nonzeroes[dst_dim] = true;
+          }
+        }
+      }
+      else if (_format[num_expanded_dims - 1] == DimensionType::DENSE)
+      {
+        _data.push_back(src_data[dense_tensor_idx]);
+      }
+      --dst_dim_idx;
+    }
+    else
+    {
+      int original_dim_idx = _traversal_order[dst_dim_idx];
+      int dim_size = expanded_shape[original_dim_idx];
+      if (dst_dim_has_nonzeroes[dst_dim_idx])
+      {
+        // If the previous block has nonzeroes, reset the flag to false since
+        // we have just moved to a new block.
+        dst_dim_has_nonzeroes[dst_dim_idx] = false;
+      }
+      else if (_format[dst_dim_idx] == DimensionType::SPARSE_CSR)
+      {
+        // This block is empty. Delete unnecessary values if compressed.
+        int next_compressed_dim = inner_compressed_dim[dst_dim_idx];
+        int erase_offset = _dim_metadata[2 * dst_dim_idx + 1].size() *
+                           num_segments_of_next_compressed_dim[dst_dim_idx];
+        if (next_compressed_dim >= 0)
+        {
+          auto &segments = _dim_metadata[2 * inner_compressed_dim[dst_dim_idx]];
+          segments.erase(segments.begin() + 1 + erase_offset, segments.end());
+        }
+        else
+        {
+          _data.erase(_data.begin() + erase_offset, _data.end());
+        }
+      }
+      if (++coordinate[dst_dim_idx] < dim_size)
+      {
+        // The current dst_dim_idx is valid (not out of bound).
+        dense_tensor_idx += dst_ordered_offset[dst_dim_idx];
+        ++dst_dim_idx;
+      }
+      else
+      {
+        // dst_dim_idx has reached its dim size. Update segment array and go
+        // back to incrementing the previous dimension (dst_dim_idx - 1).
+        if (_format[dst_dim_idx] == DimensionType::SPARSE_CSR)
+        {
+          _dim_metadata[2 * dst_dim_idx].push_back(_dim_metadata[2 * dst_dim_idx + 1].size());
+        }
+        coordinate[dst_dim_idx] = -1;
+        dense_tensor_idx -= dst_ordered_offset[dst_dim_idx] * dim_size;
+        --dst_dim_idx;
+      }
+    }
+  }
+}
+
+template <typename T> bool Sparsifier<T>::IsZero(const T val) { return (val == 0); }
+
+template class Sparsifier<int32_t>;
+template class Sparsifier<int8_t>;
+template class Sparsifier<float>;
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/Sparsifier.h b/compiler/luci/pass/src/Sparsifier.h
new file mode 100644
index 000000000..71ea28da9
--- /dev/null
+++ b/compiler/luci/pass/src/Sparsifier.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SPARSIFIER_H__
+#define __LUCI_SPARSIFIER_H__
+
+#include <vector>
+
+#include <luci/IR/SparsityParam.h>
+
+namespace luci
+{
+
+template <typename T> class Sparsifier
+{
+public:
+  /*
+   * Creates a dense to sparse converter.
+   * @param shape             Shape of the dense tensor.
+   * @param traversal_order   In what order to traverse all dimensions,
+   *                          including block dimensions.
+   * @param format            Whether each dimension in converted tensor is
+   *                          dense or sparse (not in the traversal order).
+   * @param block_size        Size of each block dimension.
+   * @param block_map         Map from block dimension to original tensor
+   *                          dimension.
+   */
+  Sparsifier(const std::vector<int> &shape, const std::vector<int> &traversal_order,
+             const std::vector<DimensionType> &format, const std::vector<int> &block_size = {},
+             const std::vector<int> &block_map = {});
+
+  std::vector<T> GetData() { return _data; }
+  std::vector<std::vector<int>> GetDimMetadata() { return _dim_metadata; }
+
+  void DenseToSparse(const T *src_data);
+
+private:
+  // Check if val is equal to zero.
+  bool IsZero(const T val);
+
+  // Shape of the conceptual dense tensor.
+  std::vector<int> _dense_shape;
+  // Shape of the dense tensor with inner blocks reduced. For example, a (4, 4)
+  // tensor with (2, 2) block has blocked_shape (2, 2).
+  std::vector<int> _blocked_shape;
+  // Total number of elements in the dense tensor.
+  uint64_t _dense_size;
+  // Has n(original dimension)+k(block_dimension) elements.
+  std::vector<int> _traversal_order;
+  // Format of each dimension in the traversal order.
+  std::vector<DimensionType> _format;
+  // Size of each block dimension, in the same order as block map.
+  std::vector<int> _block_size;
+  // Map from block dimension to the original tensor dimension.
+  std::vector<int> _block_map;
+  // Metadata of each dimension in the traversal order.
+  // Each dimension needs two vectors. For dense dimensions, the first vector
+  // stores the size of that dimension, and the second vector is empty. For
+  // sparse dimensions, the first vector stores the segments and the second one
+  // stores the indices.
+  std::vector<std::vector<int>> _dim_metadata;
+  // Actual buffer holding data after conversion. Could be sparse buffer or
+  // dense buffer.
+  std::vector<T> _data;
+};
+
+extern template class Sparsifier<int32_t>;
+extern template class Sparsifier<int8_t>;
+extern template class Sparsifier<float>;
+
+} // namespace luci
+
+#endif // __LUCI_SPARSIFIER_H__
diff --git a/compiler/luci/pass/src/SparsifyTensorPass.cpp b/compiler/luci/pass/src/SparsifyTensorPass.cpp
new file mode 100644
index 000000000..2f1a36e77
--- /dev/null
+++ b/compiler/luci/pass/src/SparsifyTensorPass.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SparsifyTensorPass.h"
+
+#include "Sparsifier.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace luci
+{
+
+template <loco::DataType DT> void SparsifyTensorPass::sparsify_tensor(luci::CircleConst *cop)
+{
+  using PRIMITIVE_DTYPE = typename loco::DataTypeImpl<DT>::Type;
+
+  std::vector<int32_t> dense_tensor_shape(cop->rank());
+  for (uint32_t d = 0; d < cop->rank(); d++)
+  {
+    dense_tensor_shape.at(d) = cop->dim(d).value();
+  }
+
+  Sparsifier<PRIMITIVE_DTYPE> sparsifier(dense_tensor_shape, _traversal_order, _format, _block_size,
+                                         _block_map);
+  // get dense tensor data
+  uint32_t dense_tensor_data_size = cop->size<DT>();
+  std::vector<PRIMITIVE_DTYPE> dense_tensor_data(dense_tensor_data_size);
+  for (uint32_t i = 0; i < dense_tensor_data_size; i++)
+  {
+    dense_tensor_data.at(i) = cop->at<DT>(i);
+  }
+  // sparsify
+  sparsifier.DenseToSparse(dense_tensor_data.data());
+  // get sparse tensor data
+  std::vector<PRIMITIVE_DTYPE> sparse_tensor_data = sparsifier.GetData();
+  uint32_t sparse_tensor_data_size = sparse_tensor_data.size();
+  cop->size<DT>(sparse_tensor_data_size);
+  for (uint32_t i = 0; i < sparse_tensor_data_size; i++)
+  {
+    cop->at<DT>(i) = sparse_tensor_data.at(i);
+  }
+  // make sparsity parameter
+  auto sparsityparam = std::make_unique<SparsityParam>();
+  sparsityparam->traversal_order = _traversal_order;
+  sparsityparam->block_map = _block_map;
+  // get dimension meta data
+  const auto dim_metadata = sparsifier.GetDimMetadata();
+  for (uint32_t idx = 0; idx < _format.size(); idx++)
+  {
+    if (_format.at(idx) == DimensionType::DENSE)
+    {
+      sparsityparam->dim_metadata.emplace_back(DimensionType::DENSE,
+                                               dim_metadata.at(idx * 2).at(0));
+    }
+    // TODO Set SparseIndexVectorType according to its data range
+    else if (_format.at(idx) == DimensionType::SPARSE_CSR)
+    {
+      sparsityparam->dim_metadata.emplace_back(
+          DimensionType::SPARSE_CSR, /* dense size */ 0,
+          /* array_segments */ SparseIndexVector{SparseIndexVectorType::U16,
+                                                 dim_metadata.at(idx * 2)},
+          /* array_indices */ SparseIndexVector{SparseIndexVectorType::U16,
+                                                dim_metadata.at(idx * 2 + 1)});
+    }
+  }
+  for (uint32_t i = 0; i < _block_size.size(); i++)
+  {
+    assert(_block_size.at(i) == dim_metadata.at((_format.size() + i) * 2).at(0));
+    sparsityparam->dim_metadata.emplace_back(DimensionType::DENSE, _block_size.at(i));
+  }
+  cop->sparsityparam(std::move(sparsityparam));
+}
+
+bool SparsifyTensorPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto cop = dynamic_cast<luci::CircleConst *>(node);
+    if (not cop)
+      continue;
+
+    if (cop->name() != _tensor_name)
+      continue;
+
+    switch (cop->dtype())
+    {
+      case loco::DataType::S32:
+        sparsify_tensor<loco::DataType::S32>(cop);
+        break;
+      case loco::DataType::S8:
+        sparsify_tensor<loco::DataType::S8>(cop);
+        break;
+      case loco::DataType::FLOAT32:
+        sparsify_tensor<loco::DataType::FLOAT32>(cop);
+        break;
+      default:
+        throw std::runtime_error("SparsifyTensorPass: Unsupported dtype.");
+    }
+    changed = true;
+  }
+
+  return changed;
+}
+
+template void SparsifyTensorPass::sparsify_tensor<loco::DataType::S32>(luci::CircleConst *cop);
+template void SparsifyTensorPass::sparsify_tensor<loco::DataType::S8>(luci::CircleConst *cop);
+template void SparsifyTensorPass::sparsify_tensor<loco::DataType::FLOAT32>(luci::CircleConst *cop);
+
+} // namespace luci
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index db25186b1..a55f50b19 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -1608,6 +1608,22 @@ loco::NodeShape infer_unpack(const luci::CircleUnpack *node)
   return loco::NodeShape{output_shape};
 }
 
+loco::NodeShape infer_unidirectionalsequencelstm(const luci::CircleUnidirectionalSequenceLSTM *node)
+{
+  auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+  auto recurrent_to_output_weights =
+      loco::shape_get(node->recurrent_to_output_weights()).as<loco::TensorShape>();
+  auto rank = input_shape.rank();
+  loco::TensorShape output_shape;
+  output_shape.rank(rank);
+  for (uint32_t i = 0; i < rank - 1; i++)
+  {
+    output_shape.dim(i) = input_shape.dim(i);
+  }
+  output_shape.dim(rank - 1) = recurrent_to_output_weights.dim(1);
+  return loco::NodeShape{output_shape};
+}
+
 loco::NodeShape infer_unique(const luci::CircleUnique *node)
 {
   auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
@@ -2047,6 +2063,12 @@ public:
     return infer_depthwise_conv2d(node);
   }
 
+  loco::NodeShape visit(const luci::CircleDequantize *node) final
+  {
+    const auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+    return loco::NodeShape{input_shape};
+  }
+
   loco::NodeShape visit(const luci::CircleDiv *node) final { return broadcast_xy(node); }
 
   loco::NodeShape visit(const luci::CircleElu *node) final
@@ -2373,6 +2395,11 @@ public:
 
   loco::NodeShape visit(const luci::CircleUnpack *node) final { return infer_unpack(node); }
 
+  loco::NodeShape visit(const luci::CircleUnidirectionalSequenceLSTM *node) final
+  {
+    return infer_unidirectionalsequencelstm(node);
+  }
+
   loco::NodeShape visit(const luci::CircleUnique *node) final { return infer_unique(node); }
 
   loco::NodeShape visit(const luci::CircleWhere *node) final { return use_own(node); }
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index d28d8ac99..f738ab5a8 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -111,6 +111,8 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
     return loco::dtype_get(node->input());
   }
 
+  loco::DataType visit(const luci::CircleDequantize *) final { return loco::DataType::FLOAT32; }
+
   loco::DataType visit(const luci::CircleDiv *node) final { return loco::dtype_get(node->x()); }
 
   loco::DataType visit(const luci::CircleElu *node) final
@@ -490,6 +492,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
     return loco::dtype_get(node->outBackprop());
   }
 
+  loco::DataType visit(const luci::CircleUnidirectionalSequenceLSTM *node) final
+  {
+    return loco::dtype_get(node->input());
+  }
+
   loco::DataType visit(const luci::CircleUnique *node) final
   {
     return loco::dtype_get(node->input());
diff --git a/compiler/luci/service/src/Validate.cpp b/compiler/luci/service/src/Validate.cpp
index 282a068e0..d224fd172 100644
--- a/compiler/luci/service/src/Validate.cpp
+++ b/compiler/luci/service/src/Validate.cpp
@@ -75,6 +75,11 @@ bool validate_shape_dtype(loco::Graph *g)
     assert(circle_output != nullptr);
     assert(circle_output->from() != nullptr);
     auto circle_node = loco::must_cast<luci::CircleNode *>(circle_output->from());
+
+    // Shape and dtype validation for CiecleOutputExclude is not needed
+    if (dynamic_cast<luci::CircleOutputExclude *>(circle_node))
+      continue;
+
     assert(loco::shape_known(circle_node));
 
     // check if output node shape is same as graph output shape
diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
index 12dd7ff5b..897d41983 100644
--- a/compiler/luci/tests/test.lst
+++ b/compiler/luci/tests/test.lst
@@ -42,6 +42,7 @@ addread(DepthwiseConv2D_000)
 addread(DepthwiseConv2D_U8_000)
 addread(DepthwiseConv2D_U8_001)
 addread(DepthwiseConv2D_001)
+addread(Dequantize_000)
 addread(Div_000)
 addread(ELU_000)
 addread(Equal_000)
@@ -180,6 +181,8 @@ addread(TopKV2_000)
 addread(TopKV2_001)
 addread(Transpose_000)
 addread(TransposeConv_000)
+addread(UnidirectionalSequenceLSTM_000)
+addread(UnidirectionalSequenceLSTM_001)
 addread(Unique_000)
 addread(Unique_001)
 addread(Unique_002)
@@ -256,6 +259,7 @@ addwrite(DepthwiseConv2D_000)
 addwrite(DepthwiseConv2D_U8_000)
 addwrite(DepthwiseConv2D_U8_001)
 addwrite(DepthwiseConv2D_001)
+addwrite(Dequantize_000)
 addwrite(Div_000)
 addwrite(ELU_000)
 addwrite(Equal_000)
@@ -393,6 +397,8 @@ addwrite(TopKV2_000)
 addwrite(TopKV2_001)
 addwrite(Transpose_000)
 addwrite(TransposeConv_000)
+addwrite(UnidirectionalSequenceLSTM_000)
+addwrite(UnidirectionalSequenceLSTM_001)
 addwrite(Unique_000)
 addwrite(Unique_001)
 addwrite(Unique_002)
diff --git a/compiler/mio-tf/CMakeLists.txt b/compiler/mio-tf/CMakeLists.txt
index d670f6bab..133d4684a 100644
--- a/compiler/mio-tf/CMakeLists.txt
+++ b/compiler/mio-tf/CMakeLists.txt
@@ -1,6 +1,6 @@
 nnas_find_package(Protobuf QUIET)
 # TensorFlowSource package is used to use ~.proto files
-nnas_find_package(TensorFlowSource EXACT 1.12 QUIET)
+nnas_find_package(TensorFlowSource EXACT 2.3 QUIET)
 
 if(NOT Protobuf_FOUND)
   return()
diff --git a/compiler/one-cmds/CMakeLists.txt b/compiler/one-cmds/CMakeLists.txt
index 173b8b476..a7135d64b 100644
--- a/compiler/one-cmds/CMakeLists.txt
+++ b/compiler/one-cmds/CMakeLists.txt
@@ -43,3 +43,9 @@ foreach(ONE_DOCUMENT IN ITEMS ${ONE_DOCUMENT_FILES})
   install(FILES ${ONE_DOCUMENT} DESTINATION doc)
 
 endforeach(ONE_DOCUMENT)
+
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+add_subdirectory(tests)
diff --git a/compiler/one-cmds/how-to-prepare-virtualenv.txt b/compiler/one-cmds/how-to-prepare-virtualenv.txt
index 62a94968b..f3dcf704b 100644
--- a/compiler/one-cmds/how-to-prepare-virtualenv.txt
+++ b/compiler/one-cmds/how-to-prepare-virtualenv.txt
@@ -1,7 +1,7 @@
 About
 -----
 
-Last update: 2020-08-03
+Last update: 2020-09-15
 
 This document explains about 'one-prepare-venv' command.
 
@@ -20,8 +20,8 @@ $ sudo apt-get upgrade
 $ sudo apt-get install python3-pip python3-venv
 
 
-How to run
-----------
+How to run for Ubuntu
+---------------------
 
 Just run 'one-prepare-venv' command
 
@@ -30,6 +30,23 @@ $ one-prepare-venv
 There will be venv folder as of result.
 
 
+How to run for Windows
+----------------------
+
+1. First, please prepare Python 3.5-3.7
+2. Open the Command Prompt as an administrator
+3. cd(change directory) to the directory where one-compiler is installed
+4. run below command
+```
+$ ONE\install\bin> python -m venv venv
+$ ONE\install\bin> cd venv/Scripts
+$ ONE\install\bin\venv/Scripts> pip.exe install -U pip
+$ ONE\install\bin\venv/Scripts> pip.exe install -U tensorflow-cpu==2.3.0
+```
+
+After running the above command, go back to MinGW and run one-compiler.
+
+
 Trouble shooting
 ----------------
 
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index 0ee69e077..2b903c973 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -37,7 +37,7 @@ This will convert Tensorflow model file (.pb) to our circle model file with appl
 To execute this command, original Tensorflow model file must include BCQ information.
 
 This command invokes following scripts internally.
-- preserve_bcq_info : Prevent BCQ information vanishing problem
+- generate_bcq_metadata : Generate BCQ metadata in the model
 - generate_bcq_info : Designate BCQ information nodes as model output automatically
 - tf2tfliteV2 : Convert Tensorflow model to tflite model
 - tflite2circle : Convert Tensorflow Lite model to circle model
@@ -81,6 +81,8 @@ one-optimize
 one-optimize provides network or operator transformation shown below.
 
 Current transformation options are
+- fold_dequantize : This removes Dequantize operation which can be folded
+- fuse_add_with_tconv: This fuses Add operator with the preceding TConv operator if possible
 - fuse_bcq: This enables Binary-Coded-bases Quantized DNNs
    - read https://arxiv.org/abs/2005.09904 for detailed information
 - fuse_instnorm: This will convert instance normalization related operators to
diff --git a/compiler/one-cmds/one-import-bcq b/compiler/one-cmds/one-import-bcq
index 98dd1efed..7c583071f 100644
--- a/compiler/one-cmds/one-import-bcq
+++ b/compiler/one-cmds/one-import-bcq
@@ -38,6 +38,41 @@ version()
   exit 255
 }
 
+input_not_set()
+{
+  echo "Error: input_path not set"
+  echo ""
+  usage
+}
+
+output_not_set()
+{
+  echo "Error: output_path not set"
+  echo ""
+  usage
+}
+
+input_arrays_not_set()
+{
+  echo "Error: input_arrays not set"
+  echo ""
+  usage
+}
+
+input_shapes_not_set()
+{
+  echo "Error: input_shapes not set"
+  echo ""
+  usage
+}
+
+output_arrays_not_set()
+{
+  echo "Error: output_arrays not set"
+  echo ""
+  usage
+}
+
 TF_INTERFACE="--v1"
 
 # Parse command-line arguments
@@ -54,22 +89,37 @@ while [ "$#" -ne 0 ]; do
       ;;
     '--input_path')
       export INPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        input_not_set
+      fi
       shift 2
       ;;
     '--output_path')
       export OUTPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        output_not_set
+      fi
       shift 2
       ;;
     '--input_arrays')
       export INPUT_ARRAYS="$2"
+      if [ $# -lt 2 ]; then
+        input_arrays_not_set
+      fi
       shift 2
       ;;
     '--input_shapes')
       export INPUT_SHAPES="$2"
+      if [ $# -lt 2 ]; then
+        input_shapes_not_set
+      fi
       shift 2
       ;;
     '--output_arrays')
       export OUTPUT_ARRAYS="$2"
+      if [ $# -lt 2 ]; then
+        output_arrays_not_set
+      fi
       shift 2
       ;;
     '--v2')
@@ -90,6 +140,20 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
   exit 2
 fi
 
+if [ -z ${INPUT_ARRAYS} ]; then
+  input_arrays_not_set
+fi
+
+# INPUT_SHAPES is optional
+
+if [ -z ${OUTPUT_PATH} ]; then
+  output_not_set
+fi
+
+if [ -z ${OUTPUT_ARRAYS} ]; then
+  output_arrays_not_set
+fi
+
 FILE_BASE=$(basename ${OUTPUT_PATH})
 MODEL_NAME="${FILE_BASE%.*}"
 
@@ -104,40 +168,58 @@ if [ -e ${VIRTUALENV_LINUX} ]; then
   source ${VIRTUALENV_LINUX}
 elif [ -e ${VIRTUALENV_WINDOWS} ]; then
   source ${VIRTUALENV_WINDOWS}
+else
+  echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command."
+  echo "If encounter any problems, please follow provided document in 'doc' folder."
+  exit 255
 fi
 
 # remove previous log
 rm -rf "${OUTPUT_PATH}.log"
 
-# generate temporary preserved pb file
-echo "${DRIVER_PATH}/preserve_bcq_info" --input_path ${INPUT_PATH} \
---output_path "${TMPDIR}/${MODEL_NAME}_preserved.pb"  > "${OUTPUT_PATH}.log"
-echo " " >> "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+  cat "${OUTPUT_PATH}.log"
+}
 
-"${DRIVER_PATH}/preserve_bcq_info" --input_path ${INPUT_PATH} \
---output_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" >> "${OUTPUT_PATH}.log" 2>&1
+trap show_err_onexit ERR
 
-# generate output_arrays automatically
-echo "${DRIVER_PATH}/generate_bcq_output_arrays" \
---input_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" \
---output_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt" > "${OUTPUT_PATH}.log"
-echo " " >> "${OUTPUT_PATH}.log"
+# Generate BCQ information metadata
+# If model has no BCQ information or invalid information, pb file is not changed.
+METAGEN_SCRIPT="${DRIVER_PATH}/generate_bcq_metadata "
+METAGEN_SCRIPT+="--input_path ${INPUT_PATH} "
+METAGEN_SCRIPT+="--output_path ${TMPDIR}/${MODEL_NAME}_withmeta.pb "
+METAGEN_SCRIPT+="--output_arrays ${OUTPUT_ARRAYS} "
+
+echo ${METAGEN_SCRIPT} > "${OUTPUT_PATH}.log"
+echo "" >> "${OUTPUT_PATH}.log"
+$METAGEN_SCRIPT >> "${OUTPUT_PATH}.log" 2>&1
+
+# Generate BCQ information nodes as output_arrays
+# If model has no BCQ information, output_arrays would be empty.
+OUTARR_GEN_SCRIPT="${DRIVER_PATH}/generate_bcq_output_arrays "
+OUTARR_GEN_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_withmeta.pb "
+OUTARR_GEN_SCRIPT+="--metadata_path ${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt "
+OUTARR_GEN_SCRIPT+="--output_arrays_path ${TMPDIR}/${MODEL_NAME}_output_arrays.txt "
 
-"${DRIVER_PATH}/generate_bcq_output_arrays" \
---input_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" \
---output_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt" >> "${OUTPUT_PATH}.log" 2>&1
+echo ${OUTARR_GEN_SCRIPT} >> "${OUTPUT_PATH}.log"
+echo "" >> "${OUTPUT_PATH}.log"
+$OUTARR_GEN_SCRIPT >> "${OUTPUT_PATH}.log" 2>&1
 
-# generate temporary tflite file
+# generate tflite file
 CONVERT_SCRIPT="python ${DRIVER_PATH}/tf2tfliteV2.py ${TF_INTERFACE} "
-CONVERT_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_preserved.pb "
+CONVERT_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_withmeta.pb "
 CONVERT_SCRIPT+="--input_arrays ${INPUT_ARRAYS} "
 CONVERT_SCRIPT+="--output_path ${TMPDIR}/${MODEL_NAME}.tflite "
-CONVERT_SCRIPT+="--output_arrays ${OUTPUT_ARRAYS}$(cat ${TMPDIR}/${MODEL_NAME}_output_arrays.txt) "
+CONVERT_SCRIPT+="--output_arrays "
+CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt)"
+CONVERT_SCRIPT+="${OUTPUT_ARRAYS}"
+CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_output_arrays.txt) "
 if [ ! -z ${INPUT_SHAPES} ]; then
   CONVERT_SCRIPT+="--input_shapes ${INPUT_SHAPES} "
 fi
 
-echo ${CONVERT_SCRIPT} > "${OUTPUT_PATH}.log"
+echo ${CONVERT_SCRIPT} >> "${OUTPUT_PATH}.log"
 $CONVERT_SCRIPT >> "${OUTPUT_PATH}.log" 2>&1
 
 # convert .tflite to .circle
diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf
index 58c686882..3b2b976e1 100644
--- a/compiler/one-cmds/one-import-tf
+++ b/compiler/one-cmds/one-import-tf
@@ -38,6 +38,41 @@ version()
   exit 255
 }
 
+input_not_set()
+{
+  echo "Error: input_path not set"
+  echo ""
+  usage
+}
+
+output_not_set()
+{
+  echo "Error: output_path not set"
+  echo ""
+  usage
+}
+
+input_arrays_not_set()
+{
+  echo "Error: input_arrays not set"
+  echo ""
+  usage
+}
+
+input_shapes_not_set()
+{
+  echo "Error: input_shapes not set"
+  echo ""
+  usage
+}
+
+output_arrays_not_set()
+{
+  echo "Error: output_arrays not set"
+  echo ""
+  usage
+}
+
 TF_INTERFACE="--v1"
 
 # Parse command-line arguments
@@ -54,22 +89,37 @@ while [ "$#" -ne 0 ]; do
       ;;
     '--input_path')
       export INPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        input_not_set
+      fi
       shift 2
       ;;
     '--output_path')
       export OUTPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        output_not_set
+      fi
       shift 2
       ;;
     '--input_arrays')
       export INPUT_ARRAYS="$2"
+      if [ $# -lt 2 ]; then
+        input_arrays_not_set
+      fi
       shift 2
       ;;
     '--input_shapes')
       export INPUT_SHAPES="$2"
+      if [ $# -lt 2 ]; then
+        input_shapes_not_set
+      fi
       shift 2
       ;;
     '--output_arrays')
       export OUTPUT_ARRAYS="$2"
+      if [ $# -lt 2 ]; then
+        output_arrays_not_set
+      fi
       shift 2
       ;;
     '--v2')
@@ -94,6 +144,20 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
   exit 2
 fi
 
+if [ -z ${INPUT_ARRAYS} ]; then
+  input_arrays_not_set
+fi
+
+# INPUT_SHAPES is optional
+
+if [ -z ${OUTPUT_PATH} ]; then
+  output_not_set
+fi
+
+if [ -z ${OUTPUT_ARRAYS} ]; then
+  output_arrays_not_set
+fi
+
 FILE_BASE=$(basename ${OUTPUT_PATH})
 MODEL_NAME="${FILE_BASE%.*}"
 
@@ -108,6 +172,10 @@ if [ -e ${VIRTUALENV_LINUX} ]; then
   source ${VIRTUALENV_LINUX}
 elif [ -e ${VIRTUALENV_WINDOWS} ]; then
   source ${VIRTUALENV_WINDOWS}
+else
+  echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command."
+  echo "If encounter any problems, please follow provided document in 'doc' folder."
+  exit 255
 fi
 
 # remove previous log
diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
index 053489c92..0d5cef101 100644
--- a/compiler/one-cmds/one-import-tflite
+++ b/compiler/one-cmds/one-import-tflite
@@ -34,6 +34,20 @@ version()
   exit 255
 }
 
+input_not_set()
+{
+  echo "Error: input_path not set"
+  echo ""
+  usage
+}
+
+output_not_set()
+{
+  echo "Error: output_path not set"
+  echo ""
+  usage
+}
+
 # Parse command-line arguments
 #
 while [ "$#" -ne 0 ]; do
@@ -48,10 +62,16 @@ while [ "$#" -ne 0 ]; do
       ;;
     '--input_path')
       export INPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        input_not_set
+      fi
       shift 2
       ;;
     '--output_path')
       export OUTPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        output_not_set
+      fi
       shift 2
       ;;
     *)
@@ -67,6 +87,10 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
   usage
 fi
 
+if [ -z ${OUTPUT_PATH} ]; then
+  output_not_set
+fi
+
 # remove previous log
 rm -rf "${OUTPUT_PATH}.log"
 
diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
index 17b6b980e..e2427a9ef 100644
--- a/compiler/one-cmds/one-optimize
+++ b/compiler/one-cmds/one-optimize
@@ -24,6 +24,10 @@ usage()
   echo "Usage: one-optimize"
   echo "    --version       Show version information and exit"
   echo "    --all           Enable all optimization algorithms"
+  echo "    --fold_dequantize"
+  echo "                    Enable FoldDequantize Pass"
+  echo "    --fuse_add_with_tconv"
+  echo "                    Enable FuseAddWithTConv Pass"
   echo "    --fuse_bcq      Enable FuseBCQ Pass"
   echo "    --fuse_instnorm Enable FuseInstanceNormalization Pass"
   echo "    --resolve_customop_add"
@@ -44,6 +48,8 @@ version()
 }
 
 OPTIMIZE_all=0
+OPTIMIZE_fold_dequantize=0
+OPTIMIZE_fuse_add_with_tconv=0
 OPTIMIZE_fuse_bcq=0
 OPTIMIZE_fuse_instnorm=0
 OPTIMIZE_resolve_customop_add=0
@@ -66,6 +72,14 @@ while [ "$#" -ne 0 ]; do
       OPTIMIZE_all=1
       shift
       ;;
+    '--fold_dequantize')
+      OPTIMIZE_fold_dequantize=1
+      shift
+      ;;
+    '--fuse_add_with_tconv')
+      OPTIMIZE_fuse_add_with_tconv=1
+      shift
+      ;;
     '--fuse_bcq')
       OPTIMIZE_fuse_bcq=1
       shift
@@ -113,6 +127,12 @@ OPTIMIZE_OPTIONS=""
 if [ $OPTIMIZE_all == 1 ]; then
   OPTIMIZE_OPTIONS+="--all "
 fi
+if [ $OPTIMIZE_fold_dequantize == 1 ]; then
+  OPTIMIZE_OPTIONS+="--fold_dequantize "
+fi
+if [ $OPTIMIZE_fuse_add_with_tconv == 1 ]; then
+  OPTIMIZE_OPTIONS+="--fuse_add_with_tconv "
+fi
 if [ $OPTIMIZE_fuse_bcq == 1 ]; then
   OPTIMIZE_OPTIONS+="--fuse_bcq "
 fi
diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack
index 023b0a85f..fe9f1bc37 100644
--- a/compiler/one-cmds/one-pack
+++ b/compiler/one-cmds/one-pack
@@ -34,6 +34,20 @@ version()
   exit 255
 }
 
+input_not_set()
+{
+  echo "Error: input path not set"
+  echo ""
+  usage
+}
+
+output_not_set()
+{
+  echo "Error: output path not set"
+  echo ""
+  usage
+}
+
 # Parse command-line arguments
 #
 while [ "$#" -ne 0 ]; do
@@ -51,10 +65,16 @@ while [ "$#" -ne 0 ]; do
       ;;
     '-i')
       export INPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        input_not_set
+      fi
       shift 2
       ;;
     '-o')
       export OUTPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        output_not_set
+      fi
       shift 2
       ;;
     *)
@@ -70,6 +90,10 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
   usage
 fi
 
+if [ -z ${OUTPUT_PATH} ]; then
+  output_not_set
+fi
+
 INPUT_FILE=$(basename "${INPUT_PATH}")
 LOG_FILE="${INPUT_FILE%.*}.pack.log"
 
diff --git a/compiler/one-cmds/one-prepare-venv b/compiler/one-cmds/one-prepare-venv
index 0b11e7f0b..4fa6f519e 100644
--- a/compiler/one-cmds/one-prepare-venv
+++ b/compiler/one-cmds/one-prepare-venv
@@ -52,3 +52,7 @@ python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host file
   install -U pip==20.2.1 setuptools==49.3.0
 python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
   install tensorflow-cpu==2.3.0
+
+# Create python symoblic link
+rm -f ${DRIVER_PATH}/python
+ln -s venv/bin/python ${DRIVER_PATH}/python
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index c74b2c2d2..112bff337 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -24,7 +24,7 @@ usage()
   echo "Usage: one-quantize"
   echo "    --version         Show version information and exit"
   echo "    --input_dtype     Input data type (supported: float32, default=float32)"
-  echo "    --quantized_dtype Output quantized data type (supported: uint8, default=uint8)"
+  echo "    --quantized_dtype Output quantized data type (supported: uint8, int16, default=uint8)"
   echo "    --granularity     Quantize granularity (supported: layer, channel, default=layer)"
   echo "    --min_percentile  Minimum percentile (0.0~100.0, default=1.0)"
   echo "    --max_percentile  Maximum percentile (0.0~100.0, default=99.0)"
@@ -41,6 +41,69 @@ version()
   exit 255
 }
 
+input_not_set()
+{
+  echo "Error: input_path not set"
+  echo ""
+  usage
+}
+
+output_not_set()
+{
+  echo "Error: output_path not set"
+  echo ""
+  usage
+}
+
+input_data_not_set()
+{
+  echo "Error: input_data not set"
+  echo ""
+  usage
+}
+
+input_dtype_not_set()
+{
+  echo "Error: input_dtype not set"
+  echo ""
+  usage
+}
+
+quantized_dtype_not_set()
+{
+  echo "Error: quantized_dtype not set"
+  echo ""
+  usage
+}
+
+granularity_not_set()
+{
+  echo "Error: granularity not set"
+  echo ""
+  usage
+}
+
+min_percentile_not_set()
+{
+  echo "Error: min_percentile not set"
+  echo ""
+  usage
+}
+
+max_percentile_not_set()
+{
+  echo "Error: max_percentile not set"
+  echo ""
+  usage
+}
+
+mode_not_set()
+{
+  echo "Error: mode not set"
+  echo ""
+  usage
+}
+
 INPUT_DTYPE=float32
 QUANTIZED_DTYPE=uint8
 GRANULARITY=layer
@@ -63,39 +126,66 @@ while [ "$#" -ne 0 ]; do
 
     '--input_dtype')
       INPUT_DTYPE="$2"
+      if [ $# -lt 2 ]; then
+        input_dtype_not_set
+      fi
       shift 2
       ;;
     '--quantized_dtype')
       QUANTIZED_DTYPE="$2"
+      if [ $# -lt 2 ]; then
+        quantized_dtype_not_set
+      fi
       shift 2
       ;;
     '--granularity')
       GRANULARITY="$2"
+      if [ $# -lt 2 ]; then
+        granularity_not_set
+      fi
       shift 2
       ;;
     '--min_percentile')
       MIN_PERCENTILE="$2"
+      if [ $# -lt 2 ]; then
+        min_percentile_not_set
+      fi
       shift 2
       ;;
     '--max_percentile')
       MAX_PERCENTILE="$2"
+      if [ $# -lt 2 ]; then
+        max_percentile_not_set
+      fi
       shift 2
       ;;
     '--mode')
       MODE="$2"
+      if [ $# -lt 2 ]; then
+        mode_not_set
+      fi
       shift 2
       ;;
 
     '--input_path')
       INPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        input_not_set
+      fi
       shift 2
       ;;
     '--input_data')
       INPUT_DATA="$2"
+      if [ $# -lt 2 ]; then
+        input_data_not_set
+      fi
       shift 2
       ;;
     '--output_path')
       OUTPUT_PATH="$2"
+      if [ $# -lt 2 ]; then
+        output_not_set
+      fi
       shift 2
       ;;
 
@@ -116,6 +206,9 @@ if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then
   echo ""
   usage
 fi
+if [ -z ${OUTPUT_PATH} ]; then
+  output_not_set
+fi
 
 FILE_BASE=$(basename ${OUTPUT_PATH})
 MODEL_NAME="${FILE_BASE%.*}"
diff --git a/compiler/one-cmds/tests/CMakeLists.txt b/compiler/one-cmds/tests/CMakeLists.txt
new file mode 100644
index 000000000..cb1081d28
--- /dev/null
+++ b/compiler/one-cmds/tests/CMakeLists.txt
@@ -0,0 +1,49 @@
+# Install one-cmds test scripts
+
+# Gather test scripts
+file(GLOB TESTITEMS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.test")
+
+# Create a script to run the tests at installation folder
+set(DRIVER_SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/runtestall.sh")
+
+file(WRITE  "${DRIVER_SCRIPT}" "#!/bin/bash\n\n")
+file(APPEND "${DRIVER_SCRIPT}" "SCRIPT_PATH=$(cd $(dirname $\{BASH_SOURCE\[0\]\}) && pwd)\n")
+file(APPEND "${DRIVER_SCRIPT}" "pushd $SCRIPT_PATH > /dev/null\n")
+file(APPEND "${DRIVER_SCRIPT}" "rm -rf runtestall.log\n")
+file(APPEND "${DRIVER_SCRIPT}" "export PATH=$SCRIPT_PATH/../bin:$PATH\n")
+file(APPEND "${DRIVER_SCRIPT}" "if [[ $# -ge 1 ]]; then\n")
+file(APPEND "${DRIVER_SCRIPT}" "  USER_PATH=$1\n")
+file(APPEND "${DRIVER_SCRIPT}" "  export PATH=$USER_PATH:$PATH\n")
+file(APPEND "${DRIVER_SCRIPT}" "fi\n")
+file(APPEND "${DRIVER_SCRIPT}" "\n")
+
+foreach(TESTITEM IN ITEMS ${TESTITEMS})
+  get_filename_component(ITEM_PREFIX ${TESTITEM} NAME_WE)
+
+  set(TESTITEM_SCRIPT_FILE "${ITEM_PREFIX}.test")
+  set(TESTITEM_SCRIPT_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/${TESTITEM_SCRIPT_FILE}")
+
+  file(APPEND "${DRIVER_SCRIPT}" "/bin/bash ${TESTITEM_SCRIPT_FILE} | tee -a runtestall.log\n")
+
+  install(FILES ${TESTITEM} DESTINATION test)
+
+endforeach(TESTITEM)
+
+file(APPEND "${DRIVER_SCRIPT}" "popd> /dev/null")
+
+set(PREPARE_TEST_MATERIALS_SH "${CMAKE_CURRENT_SOURCE_DIR}/prepare_test_materials.sh")
+
+install(FILES ${DRIVER_SCRIPT}
+        PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                    GROUP_READ GROUP_EXECUTE
+                    WORLD_READ WORLD_EXECUTE
+        DESTINATION test)
+
+install(FILES ${PREPARE_TEST_MATERIALS_SH}
+        PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                    GROUP_READ GROUP_EXECUTE
+                    WORLD_READ WORLD_EXECUTE
+        DESTINATION test)
+
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.txt
+        DESTINATION test)
diff --git a/compiler/one-cmds/tests/README.txt b/compiler/one-cmds/tests/README.txt
new file mode 100644
index 000000000..0d4d0ecbe
--- /dev/null
+++ b/compiler/one-cmds/tests/README.txt
@@ -0,0 +1,27 @@
+one-cmds testing
+================
+
+Run 'runtestall.sh' program to test ONE command line programs, all at once.
+
+Steps:
+1) run 'one-prepare-venv' in bin folder to prepare python virtual-env with TensorFlow
+  - you need to run this only once
+  - read 'doc/how-to-prepare-virtualenv.txt' for more information
+----------------------------------------------
+bin/one-prepare-venv
+----------------------------------------------
+
+2) run 'test/prepare_test_materials.sh' to download test material models
+  - you need to run this only once
+  - you need internet connection to download files
+  - you may need to install 'wget' and 'unzip' packages
+----------------------------------------------
+test/prepare_test_materials.sh
+----------------------------------------------
+
+3) run 'test/runtestall.sh' to run the test
+----------------------------------------------
+test/runtestall.sh
+----------------------------------------------
+
+End.
diff --git a/compiler/one-cmds/tests/one-import-bcq_001.test b/compiler/one-cmds/tests/one-import-bcq_001.test
new file mode 100644
index 000000000..74b8bab32
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_001.test
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf $outputfile
+rm -rf $outputfile.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder \
+--output_arrays MatMul >> /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_001.test b/compiler/one-cmds/tests/one-import-bcq_neg_001.test
new file mode 100644
index 000000000..9a2b455ef
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_001.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input array
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: Invalid tensors" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder_null \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_002.test b/compiler/one-cmds/tests/one-import-bcq_neg_002.test
new file mode 100644
index 000000000..5779aa03e
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_002.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid output array
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: Invalid tensors" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder \
+--output_arrays MatMul_null > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_003.test b/compiler/one-cmds/tests/one-import-bcq_neg_003.test
new file mode 100644
index 000000000..3a8e6368b
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_003.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "input model not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq_null.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_004.test b/compiler/one-cmds/tests/one-import-bcq_neg_004.test
new file mode 100644
index 000000000..dc83b98c8
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_004.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Error parsing message" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./while_3.pbtxt"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_005.test b/compiler/one-cmds/tests/one-import-bcq_neg_005.test
new file mode 100644
index 000000000..ad1196a67
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_005.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid output path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Failed to write circle" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="."
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_006.test b/compiler/one-cmds/tests/one-import-bcq_neg_006.test
new file mode 100644
index 000000000..f761aa7c0
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_006.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shapes
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: The shape of tensor" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder --input_shapes "1,32,32" \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_007.test b/compiler/one-cmds/tests/one-import-bcq_neg_007.test
new file mode 100644
index 000000000..5013254f5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_007.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shapes
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: The shape of tensor" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder --input_shapes "30,30" \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_008.test b/compiler/one-cmds/tests/one-import-bcq_neg_008.test
new file mode 100644
index 000000000..e7d5d2e03
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_008.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shapes
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: invalid literal for" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder --input_shapes "32,O" \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-bcq_neg_009.test b/compiler/one-cmds/tests/one-import-bcq_neg_009.test
new file mode 100644
index 000000000..ef990438a
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-bcq_neg_009.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shapes
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "must have the same number of items" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./bcq.pb"
+outputfile="./bcq.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import-bcq \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Placeholder --input_shapes "32,32:1" \
+--output_arrays MatMul > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_001.test b/compiler/one-cmds/tests/one-import_001.test
new file mode 100644
index 000000000..165f49193
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_001.test
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+# Note: Do not remove output circle file as it's used for quantize tests
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 >> /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-import_neg_001.test b/compiler/one-cmds/tests/one-import_neg_001.test
new file mode 100644
index 000000000..5a233d2f3
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_001.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage improper input model
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: Invalid tensors 'input' were found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.tflite"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_002.test b/compiler/one-cmds/tests/one-import_neg_002.test
new file mode 100644
index 000000000..78b77511a
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_002.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with unsupported dynamic tensor
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "is incompatible with result type" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./while_3.pbtxt"
+outputfile="./while_3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays Hole,Hole_2 --input_shapes "1,1:1,1" \
+--output_arrays Output > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_003.test b/compiler/one-cmds/tests/one-import_neg_003.test
new file mode 100644
index 000000000..b0e95b6d1
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_003.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid output array
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: Invalid tensors" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_2 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_004.test b/compiler/one-cmds/tests/one-import_neg_004.test
new file mode 100644
index 000000000..afb206ad9
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_004.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shape
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: The shape of tensor" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299,1" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_005.test b/compiler/one-cmds/tests/one-import_neg_005.test
new file mode 100644
index 000000000..045fb89b7
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_005.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shape
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: The shape of tensor" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_006.test b/compiler/one-cmds/tests/one-import_neg_006.test
new file mode 100644
index 000000000..bb3e2344c
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_006.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shape
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ConverterError: <unknown>:0: error:" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "0,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_007.test b/compiler/one-cmds/tests/one-import_neg_007.test
new file mode 100644
index 000000000..bcfc4bcc9
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_007.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shape
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: invalid literal" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "None,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_008.test b/compiler/one-cmds/tests/one-import_neg_008.test
new file mode 100644
index 000000000..2f48fd708
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_008.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input shape
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "must have the same number of items" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input,InceptionV3/Predictions/Shape --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_009.test b/compiler/one-cmds/tests/one-import_neg_009.test
new file mode 100644
index 000000000..79e463d64
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_009.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid output path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Failed to write circle" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="."
+
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_010.test b/compiler/one-cmds/tests/one-import_neg_010.test
new file mode 100644
index 000000000..05677a6d4
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_neg_010.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input_arrays
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "ValueError: Invalid tensors" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import tf \
+--input_path ${inputfile} \
+--output_path ${outputfile} \
+--input_arrays input2 --input_shapes "1,299,299,3" \
+--output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-optimize_001.test b/compiler/one-cmds/tests/one-optimize_001.test
new file mode 100644
index 000000000..240a62506
--- /dev/null
+++ b/compiler/one-cmds/tests/one-optimize_001.test
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3-opt.circle"
+
+rm -rf ${outputfile}
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-optimize --all \
+--input_path ${inputfile} \
+--output_path ${outputfile} >> /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-optimize_neg_001.test b/compiler/one-cmds/tests/one-optimize_neg_001.test
new file mode 100644
index 000000000..4ee509697
--- /dev/null
+++ b/compiler/one-cmds/tests/one-optimize_neg_001.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this test should fail
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Invalid input file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3-opt.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-optimize --all \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-optimize_neg_002.test b/compiler/one-cmds/tests/one-optimize_neg_002.test
new file mode 100644
index 000000000..811b4d814
--- /dev/null
+++ b/compiler/one-cmds/tests/one-optimize_neg_002.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "input model not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circletxt"
+outputfile="./inception_v3-opt.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-optimize --all \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-optimize_neg_003.test b/compiler/one-cmds/tests/one-optimize_neg_003.test
new file mode 100644
index 000000000..17f53b9a4
--- /dev/null
+++ b/compiler/one-cmds/tests/one-optimize_neg_003.test
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage without output folder path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Failed to export" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-optimize --all \
+--input_path ${inputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-pack_001.test b/compiler/one-cmds/tests/one-pack_001.test
new file mode 100644
index 000000000..3d746dffd
--- /dev/null
+++ b/compiler/one-cmds/tests/one-pack_001.test
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfolder="nnpack"
+
+rm -rf ${outputfolder}
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-pack \
+-i ${inputfile} \
+-o ${outputfolder} >> /dev/null
+
+if [[ ! -d "${outputfolder}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-pack_neg_001.test b/compiler/one-cmds/tests/one-pack_neg_001.test
new file mode 100644
index 000000000..de1eab9c7
--- /dev/null
+++ b/compiler/one-cmds/tests/one-pack_neg_001.test
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "input model not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+rm -rf ${filename}.log
+
+# run test
+one-pack \
+-i ./inception_v2.circle \
+-o nnpack > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-pack_neg_002.test b/compiler/one-cmds/tests/one-pack_neg_002.test
new file mode 100644
index 000000000..6427c260e
--- /dev/null
+++ b/compiler/one-cmds/tests/one-pack_neg_002.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with filename without extension
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "modelfile does not have extension" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+rm -rf ${filename}.log
+rm -rf nnpack
+
+# prepare dummy file
+touch ./sample
+
+# run test
+one-pack \
+-i ./sample \
+-o nnpack > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-pack_neg_003.test b/compiler/one-cmds/tests/one-pack_neg_003.test
new file mode 100644
index 000000000..bf91a5226
--- /dev/null
+++ b/compiler/one-cmds/tests/one-pack_neg_003.test
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage without output folder path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "output path not set" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+rm -rf ${filename}.log
+
+# prepare dummy file
+touch ./sample.circle
+
+# run test
+one-pack \
+-i ./sample.circle > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_001.test b/compiler/one-cmds/tests/one-quantize_001.test
new file mode 100644
index 000000000..ef25ac800
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_001.test
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ./inception_v3.circle \
+--input_data ./inception_v3_test_data.h5 \
+--output_path ./inception_v3.quantized.circle >> /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_neg_001.test b/compiler/one-cmds/tests/one-quantize_neg_001.test
new file mode 100644
index 000000000..ccf16fbf8
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_001.test
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with unsupported input dtype
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Unsupported input type" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# test begin
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+one-quantize \
+--input_dtype float64 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_002.test b/compiler/one-cmds/tests/one-quantize_neg_002.test
new file mode 100644
index 000000000..387e53637
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_002.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with unsupported input dtype
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Unsupported output type" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint16 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_003.test b/compiler/one-cmds/tests/one-quantize_neg_003.test
new file mode 100644
index 000000000..47cf0691f
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_003.test
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with wrong representative dataset
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Input shape mismatch" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ./mobilenet_test_data.h5 \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_004.test b/compiler/one-cmds/tests/one-quantize_neg_004.test
new file mode 100644
index 000000000..8d53ca1d1
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_004.test
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid output path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Failed to export" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="."
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_005.test b/compiler/one-cmds/tests/one-quantize_neg_005.test
new file mode 100644
index 000000000..4504730d0
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_005.test
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Invalid input file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./while_3.pbtxt"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_006.test b/compiler/one-cmds/tests/one-quantize_neg_006.test
new file mode 100644
index 000000000..2ecbc0413
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_006.test
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input path
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "input model not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v2.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_007.test b/compiler/one-cmds/tests/one-quantize_neg_007.test
new file mode 100644
index 000000000..4796f9e89
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_007.test
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid input_data
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Given data file is not HDF5" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3.circle"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_008.test b/compiler/one-cmds/tests/one-quantize_neg_008.test
new file mode 100644
index 000000000..1656c6ba2
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_008.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid mode
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Unsupported mode" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--mode average \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_009.test b/compiler/one-cmds/tests/one-quantize_neg_009.test
new file mode 100644
index 000000000..6ecc5c32f
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_009.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid max_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Percentile must be ranged from" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--max_percentile 101 \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_010.test b/compiler/one-cmds/tests/one-quantize_neg_010.test
new file mode 100644
index 000000000..209645990
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_010.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid max_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Percentile must be ranged from" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--max_percentile -1 \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_011.test b/compiler/one-cmds/tests/one-quantize_neg_011.test
new file mode 100644
index 000000000..ea44c2ffe
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_011.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid min_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Percentile must be ranged from" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--min_percentile 101 \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_012.test b/compiler/one-cmds/tests/one-quantize_neg_012.test
new file mode 100644
index 000000000..b744051ae
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_012.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid min_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Percentile must be ranged from" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--min_percentile -1 \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_neg_013.test b/compiler/one-cmds/tests/one-quantize_neg_013.test
new file mode 100644
index 000000000..dec1cbd06
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_013.test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid min_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Unsupported granularity" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+inputdata="./inception_v3_test_data.h5"
+outputfile="./inception_v3.quantized.circle"
+
+rm -rf ${outputfile}.log
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test >> /dev/null
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ${inputfile} \
+--input_data ${inputdata} \
+--granularity layered \
+--output_path ${outputfile} > ${filename}.log
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/prepare_test_materials.sh b/compiler/one-cmds/tests/prepare_test_materials.sh
new file mode 100644
index 000000000..e241f44b5
--- /dev/null
+++ b/compiler/one-cmds/tests/prepare_test_materials.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See https://github.com/Samsung/ONE/issues/4155 for information
+
+SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+pushd $SCRIPT_PATH > /dev/null
+
+if [[ ! -s "inception_v3.pb" ]]; then
+    rm -rf inception_v3_2018_04_27.tgz
+    wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz
+    tar zxvf inception_v3_2018_04_27.tgz
+fi
+
+if [[ ! -s "while_3.pbtxt" ]]; then
+    rm -rf while_3.zip
+    wget https://github.com/Samsung/ONE/files/5095630/while_3.zip
+    unzip while_3.zip
+fi
+
+if [[ ! -s "inception_v3_test_data.h5" ]]; then
+    rm -rf inception_v3_test_data.zip
+    wget https://github.com/Samsung/ONE/files/5139370/inception_v3_test_data.zip
+    unzip inception_v3_test_data.zip
+fi
+
+if [[ ! -s "mobilenet_test_data.h5" ]]; then
+    rm -rf mobilenet_test_data.zip
+    wget https://github.com/Samsung/ONE/files/5139460/mobilenet_test_data.zip
+    unzip mobilenet_test_data.zip
+fi
+
+if [[ ! -s "bcq.pb" ]]; then
+    rm -rf bcq.pb.zip
+    wget https://github.com/Samsung/ONE/files/5153842/bcq.pb.zip
+    unzip bcq.pb.zip
+fi
+
+# prepare 'inception_v3.circle' file used for quantizatio test
+inputfile="./inception_v3.pb"
+outputfile="./inception_v3.circle"
+
+if [[ ! -s ${outputfile} ]]; then
+  ../bin/one-import-tf \
+  --input_path ${inputfile} \
+  --output_path ${outputfile} \
+  --input_arrays input --input_shapes "1,299,299,3" \
+  --output_arrays InceptionV3/Predictions/Reshape_1
+fi
+
+popd > /dev/null
diff --git a/compiler/oneco/CMakeLists.txt b/compiler/oneco/CMakeLists.txt
index 73bc57d43..10f466948 100644
--- a/compiler/oneco/CMakeLists.txt
+++ b/compiler/oneco/CMakeLists.txt
@@ -1,5 +1,5 @@
 nnas_find_package(Protobuf QUIET)
-nnas_find_package(ONNXSource EXACT 1.4.1 QUIET)
+nnas_find_package(ONNXSource EXACT 1.6.0 QUIET)
 
 if(NOT Protobuf_FOUND)
   return()
diff --git a/compiler/onnx-tools/CMakeLists.txt b/compiler/onnx-tools/CMakeLists.txt
new file mode 100644
index 000000000..ac4500e0e
--- /dev/null
+++ b/compiler/onnx-tools/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(ONNX_TOOL_FILES
+    onnx-dump.py
+    onnx-ops.py
+)
+
+foreach(ONNX_TOOL IN ITEMS ${ONNX_TOOL_FILES})
+
+  set(ONNX_TOOL_FILE ${ONNX_TOOL})
+  set(ONNX_TOOL_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${ONNX_TOOL_FILE}")
+  set(ONNX_TOOL_BIN "${CMAKE_CURRENT_BINARY_DIR}/${ONNX_TOOL_FILE}")
+  set(ONNX_TOOL_TARGET "${ONNX_TOOL}_target")
+
+  add_custom_command(OUTPUT ${ONNX_TOOL_BIN}
+    COMMAND ${CMAKE_COMMAND} -E copy "${ONNX_TOOL_SRC}" "${ONNX_TOOL_BIN}"
+    DEPENDS ${ONNX_TOOL_SRC}
+    COMMENT "Generate ${ONNX_TOOL_BIN}"
+  )
+
+  add_custom_target(${ONNX_TOOL_TARGET} ALL DEPENDS ${ONNX_TOOL_BIN})
+
+endforeach(ONNX_TOOL)
diff --git a/compiler/onnx-tools/README.md b/compiler/onnx-tools/README.md
new file mode 100644
index 000000000..f1b886132
--- /dev/null
+++ b/compiler/onnx-tools/README.md
@@ -0,0 +1,65 @@
+# onnx-tools
+
+_onnx-tools_ provides developer tools to support ONNX format in compiler frontend.
+
+## onnx-dump.py
+
+Use `onnx-dump.py` to dump ONNX model graph in human readable text format.
+
+For example,
+
+```
+[General] -----------------------------
+IR version = 6
+Producer   = pytorch 1.6
+
+[Operators] ---------------------------
+    3 Conv
+    3 Relu
+...
+
+[Initializers] ------------------------
+"0.bias"        FLOAT [16]
+"0.weight"      FLOAT [16, 1, 3, 3]
+...
+
+[Nodes] -------------------------------
+Conv("Conv_0")
+    A dilations: [1, 1], group: 1, kernel_shape: [3, 3], pads: [1, 1, 1, 1], strides: [2, 2]
+    I "input.1"
+    I "0.weight"
+    I "0.bias"
+    O "7"
+Relu("Relu_1")
+    I "7"
+    O "8"
+...
+
+[Graph Input/Output]-------------------
+    I: "input.1"       FLOAT [1, 1, 28, 28]
+    O: "21"            FLOAT [1, 10]
+```
+
+In `[Nodes]` section, `A` is for attributes for the node, `I` for input name and `O` for output name.
+
+`I` and `O` also applies to `[Graph Input/Output]` section.
+
+## onnx-ops.py
+
+Use `onnx-ops.py` to dump ONNX model operators.
+
+You can use with other command line tools to analyze operators in the model file.
+
+For example,
+```bash
+$ python onnx-ops.py mymodel.onnx | sort | uniq -c
+      1 Concat
+      1 Constant
+      3 Conv
+      1 Gather
+      1 GlobalAveragePool
+      3 Relu
+      1 Reshape
+      1 Shape
+      1 Unsqueeze
+```
diff --git a/compiler/onnx-tools/onnx-dump.py b/compiler/onnx-tools/onnx-dump.py
new file mode 100644
index 000000000..4f169cbe9
--- /dev/null
+++ b/compiler/onnx-tools/onnx-dump.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import onnx
+import os
+import sys
+
+from onnx import AttributeProto, TensorProto
+from onnx import numpy_helper
+from onnx import helper
+
+
+def _data_type_str(data_type):
+    return TensorProto.DataType.Name(data_type)
+
+
+def _get_attribute_value(attr):
+    if attr.type == AttributeProto.TENSOR:
+        return "{}, {}".format(
+            _data_type_str(attr.t.data_type), numpy_helper.to_array(attr.t))
+    if attr.type == AttributeProto.GRAPH:
+        # TODO revise when graph node is available
+        return "<graph>"
+    if attr.type == AttributeProto.TENSORS:
+        # TODO revise to see contents
+        return "<tensors>..."
+    if attr.type == AttributeProto.GRAPHS:
+        # TODO revise when graph node is available
+        return "<graphs>..."
+    return helper.get_attribute_value(attr)
+
+
+def _dump_header(onnx_model):
+    print("[General] -----------------------------")
+    print("IR version =", onnx_model.ir_version)
+    print("Producer   =", onnx_model.producer_name, onnx_model.producer_version)
+    print("")
+
+
+def _dump_operators(onnx_model):
+    opcodes_dict = dict()
+    for node in onnx_model.graph.node:
+        if node.op_type in opcodes_dict:
+            opcodes_dict[node.op_type] = opcodes_dict[node.op_type] + 1
+        else:
+            opcodes_dict[node.op_type] = 1
+
+    print("[Operators] ---------------------------")
+    for opcode_key in opcodes_dict:
+        print("{:>5} {}".format(opcodes_dict[opcode_key], opcode_key))
+
+    print("")
+
+
+def _dump_initializers(onnx_model):
+    print("[Initializers] ------------------------")
+    for initializer in onnx_model.graph.initializer:
+        init_name = '"{}"'.format(initializer.name)
+        dtstr = _data_type_str(initializer.data_type)
+        print('{:<15} {} {}'.format(init_name, dtstr, initializer.dims))
+
+    print("")
+
+
+def _dump_nodes(onnx_model):
+    print("[Nodes] -------------------------------")
+
+    for node in onnx_model.graph.node:
+        print('{0}("{1}")'.format(node.op_type, node.name))
+
+        attribute = ''
+        for attr in node.attribute:
+            if attribute != '':
+                attribute += ', '
+            attribute += "{}: {}".format(attr.name, _get_attribute_value(attr))
+
+        if attribute != '':
+            print('    A {0}'.format(attribute))
+
+        for inp in node.input:
+            print('    I "{0}"'.format(inp))
+        for out in node.output:
+            print('    O "{0}"'.format(out))
+
+    print("")
+
+
+def _dump_inputoutputs(onnx_model):
+    print("[Graph Input/Output]-------------------")
+    for mod_input in onnx_model.graph.input:
+        io_name = '"{}"'.format(mod_input.name)
+        dtstr = _data_type_str(mod_input.type.tensor_type.elem_type)
+        shape = mod_input.type.tensor_type.shape
+        input_shape = [dim.dim_value for dim in shape.dim]
+        print('    I: {:<15} {} {}'.format(io_name, dtstr, input_shape))
+
+    for mod_output in onnx_model.graph.output:
+        io_name = '"{}"'.format(mod_output.name)
+        dtstr = _data_type_str(mod_output.type.tensor_type.elem_type)
+        shape = mod_output.type.tensor_type.shape
+        output_shape = [dim.dim_value for dim in shape.dim]
+        print('    O: {:<15} {} {}'.format(io_name, dtstr, output_shape))
+
+    print("")
+
+
+def _dump_graph(onnx_model):
+    _dump_header(onnx_model)
+    _dump_operators(onnx_model)
+    _dump_initializers(onnx_model)
+    _dump_nodes(onnx_model)
+    _dump_inputoutputs(onnx_model)
+
+
+def _help_exit(cmd_name):
+    print('Dump ONNX model file Graph')
+    print('Usage: {0} [onnx_path]'.format(cmd_name))
+    print('')
+    exit()
+
+
+def main():
+    if len(sys.argv) < 2:
+        _help_exit(os.path.basename(sys.argv[0]))
+
+    onnx_model = onnx.load(sys.argv[1])
+    onnx.checker.check_model(onnx_model)
+
+    _dump_graph(onnx_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/compiler/onnx-tools/onnx-ops.py b/compiler/onnx-tools/onnx-ops.py
new file mode 100644
index 000000000..5292dc70e
--- /dev/null
+++ b/compiler/onnx-tools/onnx-ops.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import onnx
+import os
+import sys
+
+
+def _dump_operators(onnx_model):
+    for node in onnx_model.graph.node:
+        print(node.op_type)
+
+
+def _help_exit(cmd_name):
+    print('Dump ONNX model file Operators')
+    print('Usage: {0} [onnx_path]'.format(cmd_name))
+    print('')
+    exit()
+
+
+def main():
+    if len(sys.argv) < 2:
+        _help_exit(os.path.basename(sys.argv[0]))
+
+    onnx_model = onnx.load(sys.argv[1])
+    onnx.checker.check_model(onnx_model)
+
+    _dump_operators(onnx_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/compiler/onnxkit/CMakeLists.txt b/compiler/onnxkit/CMakeLists.txt
index 18f1ed423..81c3622c9 100644
--- a/compiler/onnxkit/CMakeLists.txt
+++ b/compiler/onnxkit/CMakeLists.txt
@@ -1,5 +1,5 @@
 nnas_find_package(Protobuf QUIET)
-nnas_find_package(ONNXSource EXACT 1.4.1 QUIET)
+nnas_find_package(ONNXSource EXACT 1.6.0 QUIET)
 
 if(NOT Protobuf_FOUND)
   return()
diff --git a/compiler/pota-quantization-value-test/README.md b/compiler/pota-quantization-value-test/README.md
index e3359ae4f..d6d003b4b 100644
--- a/compiler/pota-quantization-value-test/README.md
+++ b/compiler/pota-quantization-value-test/README.md
@@ -39,3 +39,17 @@ The expected output should include
  (1) scale, zero point of activations
  (2) scale, zero point, values of weights
  (3) scale, values (weights) of bias
+
+### Golden data
+
+Golden data was generated as follows.
+
+(1) Generate random h5 input for a target model (using gen_h5_random_inputs.py in `record-minmax-conversion-test`)
+
+(2) Run `dalgona` with the target model, input data, and analysis code named GenGoldenWeights.py for uint8 (GenGoldenWeightsSym.py for int16) (https://github.com/Samsung/ONE/pull/3501)
+
+(3) Do fake quantization using circle-quantizer
+
+(4) Run `dalgona` with the fake-quantized model, input data, and analysis code named GenGoldenActBias.py for uint8 (GenGoldenActBiasSym.py for int16) (https://github.com/Samsung/ONE/pull/3501)
+
+(5) Edit generated data for some operators (concat: scale propagation, mean: axis data)
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm1.json
new file mode 100644
index 000000000..3ae32419f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015018323028925806,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm2.json
new file mode 100644
index 000000000..ab968c9fc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ifm2.json
@@ -0,0 +1,32 @@
+{
+  "weights": [
+    [
+      [
+        [
+          4096,
+          8192,
+          -12288
+        ],
+        [
+          -16384,
+          -20479,
+          24575
+        ]
+      ],
+      [
+        [
+          -28671,
+          32767,
+          16384
+        ],
+        [
+          -8192,
+          12288,
+          -4096
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.0002441480755805969,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..4a1297410
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00037582992808893323,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ifm1.json
new file mode 100644
index 000000000..4f17e12de
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.92105390548706,
+  "max": 4.809383983612061
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..6d2169066
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -11.397277755737305,
+  "max": 12.314819450378417
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm1.json
new file mode 100644
index 000000000..a223fa4aa
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.038489170372486115,
+  "zero_point": 129.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm2.json
new file mode 100644
index 000000000..ec6082d55
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ifm2.json
@@ -0,0 +1,32 @@
+{
+  "weights": [
+    [
+      [
+        [
+          136,
+          153,
+          68
+        ],
+        [
+          51,
+          34,
+          221
+        ]
+      ],
+      [
+        [
+          0,
+          255,
+          187
+        ],
+        [
+          85,
+          170,
+          102
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.05882352963089943,
+  "zero_point": 119.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..afa9b1a8e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0892433300614357,
+  "zero_point": 134.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ifm1.json
new file mode 100644
index 000000000..0138d54cf
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.9815891456604,
+  "max": 4.833149127960205
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..8edbed5b6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Add_002/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -11.962269973754882,
+  "max": 10.79477970123291
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..6a5fc3e88
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0001524870313005522,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..b1fee1e89
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00013096666953060776,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..905be6038
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.996542510986329,
+  "max": 4.979214477539063
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..a35199a6f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -3.9937237644195553,
+  "max": 4.291385040283203
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..0528cc9cc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.03911808878183365,
+  "zero_point": 127.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..ac5da0bda
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.027372928336262703,
+  "zero_point": 141.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..8701c51ff
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.9830295753479,
+  "max": 4.992084045410156
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..b2bb2d227
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/AveragePool2D_000/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -3.863597021102905,
+  "max": 3.1164999485015867
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm1.json
new file mode 100644
index 000000000..71265a270
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0002441480755805969,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm2.json
new file mode 100644
index 000000000..53d7cdba3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ifm2.json
@@ -0,0 +1,28 @@
+{
+  "weights": [
+    [
+      [
+        [
+          4096,
+          8192
+        ],
+        [
+          -12288,
+          -16384
+        ]
+      ],
+      [
+        [
+          -20479,
+          24575
+        ],
+        [
+          -28671,
+          32767
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.0002441480755805969,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..71265a270
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0002441480755805969,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ifm1.json
new file mode 100644
index 000000000..1a3f56eb0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.83315715789795,
+  "max": 4.561212120056152
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..700674c7c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -7.0,
+  "max": 8.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm1.json
new file mode 100644
index 000000000..522880618
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.05882352963089943,
+  "zero_point": 119.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm2.json
new file mode 100644
index 000000000..17ba25363
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ifm2.json
@@ -0,0 +1,28 @@
+{
+  "weights": [
+    [
+      [
+        [
+          136,
+          153
+        ],
+        [
+          68,
+          51
+        ]
+      ],
+      [
+        [
+          34,
+          221
+        ],
+        [
+          0,
+          255
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.05882352963089943,
+  "zero_point": 119.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..522880618
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.05882352963089943,
+  "zero_point": 119.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ifm1.json
new file mode 100644
index 000000000..dc8d1db1e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -2.8125765800476072,
+  "max": 4.720572299957276
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..700674c7c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Concatenation_001/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -7.0,
+  "max": 8.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/fake_quantization/ker.json
new file mode 100644
index 000000000..8817cbef7
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/fake_quantization/ker.json
@@ -0,0 +1,48 @@
+{
+  "weights": [
+    [
+      [
+        [
+          1.000030517578125,
+          2.00006103515625
+        ],
+        [
+          -3.000091552734375,
+          -4.0001220703125
+        ]
+      ],
+      [
+        [
+          -4.999908447265625,
+          5.99993896484375
+        ],
+        [
+          -6.999969482421875,
+          8.0
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          4.0001220703125,
+          -2.00006103515625
+        ],
+        [
+          3.000091552734375,
+          -1.000030517578125
+        ]
+      ],
+      [
+        [
+          -8.0,
+          -5.99993896484375
+        ],
+        [
+          6.999969482421875,
+          4.999908447265625
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/bias.json
new file mode 100644
index 000000000..1b12a4d8d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/bias.json
@@ -0,0 +1,6 @@
+{
+  "weights": [
+    1.0,
+    2.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..b4b2e2136
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015194647130556405,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ker.json
new file mode 100644
index 000000000..94c794fbb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ker.json
@@ -0,0 +1,61 @@
+{
+  "weights": [
+    [
+      [
+        [
+          4096,
+          8192
+        ],
+        [
+          -12288,
+          -16384
+        ]
+      ],
+      [
+        [
+          -20479,
+          24575
+        ],
+        [
+          -28671,
+          32767
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          16384,
+          -8192
+        ],
+        [
+          12288,
+          -4096
+        ]
+      ],
+      [
+        [
+          -32767,
+          -24575
+        ],
+        [
+          28671,
+          20479
+        ]
+      ]
+    ]
+  ],
+  "scale": [
+    0.00024414807580797754,
+    0.00024414807580797754
+  ],
+  "zero_point": 0.0,
+  "min": [
+    -8.0,
+    -8.0
+  ],
+  "max": [
+    8.0,
+    8.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..5df65ebd4
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0018470257055014372,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..4136cdffe
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.9788299179077145,
+  "max": 4.917050857543946
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..0a35b161b
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": 0.0,
+  "max": 60.521490783691405
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/fake_quantization/ker.json
new file mode 100644
index 000000000..20c1f6759
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/fake_quantization/ker.json
@@ -0,0 +1,34 @@
+{
+  "weights": [
+    [
+      [
+        [
+          1.00018310546875,
+          2.0,
+          2.99981689453125,
+          4.0001220703125
+        ],
+        [
+          -9.00006103515625,
+          10.0,
+          -10.99993896484375,
+          11.9998779296875
+        ]
+      ],
+      [
+        [
+          5.0001220703125,
+          6.0,
+          6.9998779296875,
+          8.000244140625
+        ],
+        [
+          13.0,
+          -14.0,
+          15.0,
+          -16.0
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/bias.json
new file mode 100644
index 000000000..e0573e4f9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/bias.json
@@ -0,0 +1,8 @@
+{
+  "weights": [
+    1.0,
+    2.0,
+    3.0,
+    4.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..2d4178372
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015058962162584066,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ker.json
new file mode 100644
index 000000000..d465a7c17
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ker.json
@@ -0,0 +1,53 @@
+{
+  "weights": [
+    [
+      [
+        [
+          2521,
+          4681,
+          6553,
+          8192
+        ],
+        [
+          -22685,
+          23405,
+          -24029,
+          24575
+        ]
+      ],
+      [
+        [
+          12603,
+          14043,
+          15291,
+          16384
+        ],
+        [
+          32767,
+          -32767,
+          32767,
+          -32767
+        ]
+      ]
+    ]
+  ],
+  "scale": [
+    0.0003967406231879635,
+    0.0004272591326639607,
+    0.0004577776421399579,
+    0.0004882961516159551
+  ],
+  "zero_point": 0.0,
+  "min": [
+    -13.0,
+    -14.0,
+    -15.0,
+    -16.0
+  ],
+  "max": [
+    13.0,
+    14.0,
+    15.0,
+    16.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..5f6a88ce8
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0034479827154427767,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..991c8d6d9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.733428707122803,
+  "max": 4.9343701171875
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..e6ec29252
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": 0.0,
+  "max": 112.98004760742187
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/fake_quantization/weight.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/fake_quantization/weight.json
new file mode 100644
index 000000000..559e537fc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/fake_quantization/weight.json
@@ -0,0 +1,76 @@
+{
+  "weights": [
+    [
+      1.000030517578125,
+      2.00006103515625,
+      -3.000091552734375,
+      -4.0001220703125,
+      -4.999908447265625,
+      5.99993896484375,
+      -6.999969482421875,
+      8.0,
+      4.0001220703125,
+      -2.00006103515625,
+      3.000091552734375,
+      -1.000030517578125,
+      -8.0,
+      -5.99993896484375,
+      6.999969482421875,
+      4.999908447265625
+    ],
+    [
+      1.000030517578125,
+      2.00006103515625,
+      -3.000091552734375,
+      -4.0001220703125,
+      -4.999908447265625,
+      5.99993896484375,
+      -6.999969482421875,
+      8.0,
+      4.0001220703125,
+      -2.00006103515625,
+      3.000091552734375,
+      -1.000030517578125,
+      -8.0,
+      -5.99993896484375,
+      6.999969482421875,
+      4.999908447265625
+    ],
+    [
+      1.000030517578125,
+      2.00006103515625,
+      -3.000091552734375,
+      -4.0001220703125,
+      -4.999908447265625,
+      5.99993896484375,
+      -6.999969482421875,
+      8.0,
+      4.0001220703125,
+      -2.00006103515625,
+      3.000091552734375,
+      -1.000030517578125,
+      -8.0,
+      -5.99993896484375,
+      6.999969482421875,
+      4.999908447265625
+    ],
+    [
+      1.000030517578125,
+      2.00006103515625,
+      -3.000091552734375,
+      -4.0001220703125,
+      -4.999908447265625,
+      5.99993896484375,
+      -6.999969482421875,
+      8.0,
+      4.0001220703125,
+      -2.00006103515625,
+      3.000091552734375,
+      -1.000030517578125,
+      -8.0,
+      -5.99993896484375,
+      6.999969482421875,
+      4.999908447265625
+    ]
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/bias.json
new file mode 100644
index 000000000..1c3479fa3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/bias.json
@@ -0,0 +1,8 @@
+{
+  "weights": [
+    1.0,
+    -2.0,
+    -3.0,
+    4.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/in.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/in.json
new file mode 100644
index 000000000..ad24004c0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/in.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015171234554145485,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/out.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/out.json
new file mode 100644
index 000000000..42dc0edee
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/out.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0028422886971384287,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/weight.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/weight.json
new file mode 100644
index 000000000..69254d12b
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/quantization/weight.json
@@ -0,0 +1,95 @@
+{
+  "weights": [
+    [
+      4096,
+      8192,
+      -12288,
+      -16384,
+      -20479,
+      24575,
+      -28671,
+      32767,
+      16384,
+      -8192,
+      12288,
+      -4096,
+      -32767,
+      -24575,
+      28671,
+      20479
+    ],
+    [
+      4096,
+      8192,
+      -12288,
+      -16384,
+      -20479,
+      24575,
+      -28671,
+      32767,
+      16384,
+      -8192,
+      12288,
+      -4096,
+      -32767,
+      -24575,
+      28671,
+      20479
+    ],
+    [
+      4096,
+      8192,
+      -12288,
+      -16384,
+      -20479,
+      24575,
+      -28671,
+      32767,
+      16384,
+      -8192,
+      12288,
+      -4096,
+      -32767,
+      -24575,
+      28671,
+      20479
+    ],
+    [
+      4096,
+      8192,
+      -12288,
+      -16384,
+      -20479,
+      24575,
+      -28671,
+      32767,
+      16384,
+      -8192,
+      12288,
+      -4096,
+      -32767,
+      -24575,
+      28671,
+      20479
+    ]
+  ],
+  "scale": [
+    0.00024414807580797754,
+    0.00024414807580797754,
+    0.00024414807580797754,
+    0.00024414807580797754
+  ],
+  "zero_point": 0.0,
+  "min": [
+    -8.0,
+    -8.0,
+    -8.0,
+    -8.0
+  ],
+  "max": [
+    8.0,
+    8.0,
+    8.0,
+    8.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/in.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/in.json
new file mode 100644
index 000000000..dd4e1cb03
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/in.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.931026897430419,
+  "max": 4.971158237457275
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/out.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/out.json
new file mode 100644
index 000000000..763dce164
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/channel/int16/record_minmax/out.json
@@ -0,0 +1,4 @@
+{
+  "min": -91.51926612854004,
+  "max": 93.13327117919921
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..7bc9dff6d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015248177805915475,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..9fd574932
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015124233323149383,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..318cd008b
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.99637056350708,
+  "max": 4.955757389068604
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..e52196f1a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -2.257103805541992,
+  "max": 4.955757389068604
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..9bf6c9bff
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.03876218944787979,
+  "zero_point": 126.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..87de1116e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.029836513102054596,
+  "zero_point": 88.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..bb42bdf8e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.901860733032226,
+  "max": 4.982497882843018
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..bb3a52516
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/MaxPool2D_000/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -2.6258130359649656,
+  "max": 4.982497882843018
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..6671787bc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015251722652465105,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..31d974626
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00013075214519631118,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..4f1bc7595
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.997532138824463,
+  "max": 4.995666198730469
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..fc5074dbe
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.224256687164306,
+  "max": 4.284355401992798
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..ede36c6ad
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.039086975157260895,
+  "zero_point": 128.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..bd2fc7f62
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.028692100197076797,
+  "zero_point": 131.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..ae1dc5e90
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.993542575836181,
+  "max": 4.97363561630249
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..527ed8d46
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mean_000/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -3.766610870361328,
+  "max": 3.5498746299743655
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm1.json
new file mode 100644
index 000000000..802c38bea
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015054467075970024,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm2.json
new file mode 100644
index 000000000..ab968c9fc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ifm2.json
@@ -0,0 +1,32 @@
+{
+  "weights": [
+    [
+      [
+        [
+          4096,
+          8192,
+          -12288
+        ],
+        [
+          -16384,
+          -20479,
+          24575
+        ]
+      ],
+      [
+        [
+          -28671,
+          32767,
+          16384
+        ],
+        [
+          -8192,
+          12288,
+          -4096
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.0002441480755805969,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..0c497ea3f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0008732788846828043,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ifm1.json
new file mode 100644
index 000000000..c8a1be941
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.82299165725708,
+  "max": 4.932897224426269
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..12791d92c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -28.49001350402832,
+  "max": 28.614729080200195
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm1.json
new file mode 100644
index 000000000..bbff8952d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.03780897706747055,
+  "zero_point": 131.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm2.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm2.json
new file mode 100644
index 000000000..ec6082d55
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ifm2.json
@@ -0,0 +1,32 @@
+{
+  "weights": [
+    [
+      [
+        [
+          136,
+          153,
+          68
+        ],
+        [
+          51,
+          34,
+          221
+        ]
+      ],
+      [
+        [
+          0,
+          255,
+          187
+        ],
+        [
+          85,
+          170,
+          102
+        ]
+      ]
+    ]
+  ],
+  "scale": 0.05882352963089943,
+  "zero_point": 119.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..cec0bdf9a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.232084259390831,
+  "zero_point": 111.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ifm1.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ifm1.json
new file mode 100644
index 000000000..7cdb53424
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ifm1.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.954726142883301,
+  "max": 4.686561832427978
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..5f63577ea
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Mul_001/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -25.874579315185546,
+  "max": 33.30691329956055
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json
new file mode 100644
index 000000000..5f6db8d72
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/alpha.json
@@ -0,0 +1,13 @@
+{
+  "weights": [
+    [
+      [
+        6553,
+        19660,
+        32767
+      ]
+    ]
+  ],
+  "scale": 1.5259254723787308e-05,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..b43bac4d7
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0001517884520580992,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..b43bac4d7
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.0001517884520580992,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..321af6680
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.966492671966552,
+  "max": 4.97365219116211
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..2c18c21cb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -2.44303900718689,
+  "max": 4.97365219116211
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/alpha.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/alpha.json
new file mode 100644
index 000000000..7c001602f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/alpha.json
@@ -0,0 +1,13 @@
+{
+  "weights": [
+    [
+      [
+        51,
+        153,
+        255
+      ]
+    ]
+  ],
+  "scale": 0.0019607844296842813,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..05ce9dd2c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.03849203139543533,
+  "zero_point": 127.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..8f883094a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.02848827838897705,
+  "zero_point": 82.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..76e719001
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.899785652160644,
+  "max": 4.915681838989258
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..2aa27ca64
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/PRelu_001/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -2.348829574584961,
+  "max": 4.915681838989258
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..a9a676169
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015181511116679758,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..29096744a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00014686971553601325,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..99fc6992c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.974525604248047,
+  "max": 4.812480030059814
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..dd17def11
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": 0.0,
+  "max": 4.812480030059814
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..3b97773ce
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.03907399624586105,
+  "zero_point": 127.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..698a8a7ee
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.01955186203122139,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..fee2d92c0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.978144645690918,
+  "max": 4.985724964141846
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..bd6199fc0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/ReLU_000/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": 0.0,
+  "max": 4.985724964141846
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/fake_quantization/ker.json
new file mode 100644
index 000000000..6df24eb42
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/fake_quantization/ker.json
@@ -0,0 +1,48 @@
+{
+  "weights": [
+    [
+      [
+        [
+          0.999786376953125,
+          2.0001220703125
+        ],
+        [
+          -2.999908447265625,
+          -4.000244140625
+        ],
+        [
+          5.000030517578125,
+          -5.99981689453125
+        ]
+      ],
+      [
+        [
+          7.000152587890625,
+          7.99993896484375
+        ],
+        [
+          -9.000274658203125,
+          -10.00006103515625
+        ],
+        [
+          10.999847412109375,
+          -12.00018310546875
+        ]
+      ],
+      [
+        [
+          12.999969482421875,
+          13.999755859375
+        ],
+        [
+          -15.000091552734375,
+          -15.9998779296875
+        ],
+        [
+          17.000213623046875,
+          -18.0
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/.json
new file mode 100644
index 000000000..2fc53a0ff
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/.json
@@ -0,0 +1,3 @@
+{
+  "weights": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..27afd4f6a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.00015109812375158072,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ker.json
new file mode 100644
index 000000000..8d0ceb1c6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ker.json
@@ -0,0 +1,58 @@
+{
+  "weights": [
+    [
+      [
+        [
+          1820,
+          3641
+        ],
+        [
+          -5461,
+          -7282
+        ],
+        [
+          9102,
+          -10922
+        ]
+      ],
+      [
+        [
+          12743,
+          14563
+        ],
+        [
+          -16384,
+          -18204
+        ],
+        [
+          20024,
+          -21845
+        ]
+      ],
+      [
+        [
+          23665,
+          25485
+        ],
+        [
+          -27306,
+          -29126
+        ],
+        [
+          30947,
+          -32767
+        ]
+      ]
+    ]
+  ],
+  "scale": [
+    0.0005493331705679495
+  ],
+  "zero_point": 0.0,
+  "min": [
+    -18.0
+  ],
+  "max": [
+    18.0
+  ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ofm.json
new file mode 100644
index 000000000..b39fcd7a4
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+  "scale": 0.008887303993105888,
+  "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..ae3f30db7
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+  "min": -4.951032066345215,
+  "max": 4.942168235778809
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ofm.json
new file mode 100644
index 000000000..348831b9e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/channel/int16/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+  "min": -260.74616638183596,
+  "max": 291.21028076171876
+}
diff --git a/compiler/pota-quantization-value-test/test.lst b/compiler/pota-quantization-value-test/test.lst
index d9fd91761..15606b8e4 100644
--- a/compiler/pota-quantization-value-test/test.lst
+++ b/compiler/pota-quantization-value-test/test.lst
@@ -1,8 +1,28 @@
+addTest(Add_002 layer uint8)
+addTest(Add_002 channel int16)
+addTest(AveragePool2D_000 layer uint8)
+addTest(AveragePool2D_000 channel int16)
+addTest(Concatenation_001 layer uint8)
+addTest(Concatenation_001 channel int16)
 addTest(Conv2D_004 channel uint8)
+addTest(Conv2D_004 channel int16)
 addTest(Conv2D_004 layer uint8)
 addTest(DepthwiseConv2D_002 channel uint8)
+addTest(DepthwiseConv2D_002 channel int16)
 addTest(DepthwiseConv2D_002 layer uint8)
 addTest(FullyConnected_003 channel uint8)
+addTest(FullyConnected_003 channel int16)
 addTest(FullyConnected_003 layer uint8)
+addTest(Mean_000 layer uint8)
+addTest(Mean_000 channel int16)
+addTest(MaxPool2D_000 layer uint8)
+addTest(MaxPool2D_000 channel int16)
+addTest(Mul_001 layer uint8)
+addTest(Mul_001 channel int16)
+addTest(PRelu_001 layer uint8)
+addTest(PRelu_001 channel int16)
+addTest(ReLU_000 layer uint8)
+addTest(ReLU_000 channel int16)
 addTest(TransposeConv_001 channel uint8)
+addTest(TransposeConv_001 channel int16)
 addTest(TransposeConv_001 layer uint8)
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/0.txt
new file mode 100644
index 000000000..1fce3b67d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/0.txt
@@ -0,0 +1 @@
+ 2.9230394,-3.3165355, 4.3210225,-3.323507 , 1.5080413,-2.4125786,-2.1971512, 4.227092 ,-4.6573114, 3.6270325,-1.9319664, 3.9428957
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/1.txt
new file mode 100644
index 000000000..0425e1f25
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/1.txt
@@ -0,0 +1 @@
+-2.9018474,-0.9047734, 1.2572454,-3.90714  , 1.4757215, 3.2261674,-4.431676 , 4.318475 , 2.305025 , 3.5344698,-3.9441512,-1.9244975
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/2.txt
new file mode 100644
index 000000000..a3e5e910e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/2.txt
@@ -0,0 +1 @@
+ 0.42610177, 3.3458166 , 2.3041742 ,-1.8122445 , 0.48324248,-2.813166  ,-2.951401  ,-4.1868343 , 0.0872704 , 4.8097663 , 3.0866373 ,-4.744804  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/3.txt
new file mode 100644
index 000000000..b129666a3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/3.txt
@@ -0,0 +1 @@
+ 3.8244517 , 2.7528017 , 0.9391686 ,-2.3091493 ,-0.8263007 ,-3.7062955 , 1.0708941 ,-0.14978653,-2.7624338 , 1.8035483 ,-1.8124399 ,-3.5439892 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/4.txt
new file mode 100644
index 000000000..3efd52ea6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/channel/int16/4.txt
@@ -0,0 +1 @@
+ 4.8002086,-2.8242967, 1.2763925,-2.814267 , 2.3917456,-4.9283977,-3.571722 , 3.9308782, 0.1240977,-2.529994 ,-4.2461286,-1.8420148
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/0.txt
new file mode 100644
index 000000000..b6e2efa3d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/0.txt
@@ -0,0 +1 @@
+-0.8596993, 4.8127713,-3.4127183, 4.2323627,-2.2201376,-1.5362649,-4.9921966, 0.9565166, 3.2879171,-1.3590081,-3.771852 ,-4.1042285
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/1.txt
new file mode 100644
index 000000000..bcf2807ba
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 0.14624089, 4.7304125 , 4.833998  , 4.2321773 ,-2.0582533 ,-2.3694758 , 1.4213978 , 2.2444596 , 3.3630798 ,-0.70257574, 3.586656  ,-2.513805  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/2.txt
new file mode 100644
index 000000000..c3e32d2c5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/2.txt
@@ -0,0 +1 @@
+ 2.175218  , 0.02776978,-2.6291077 , 3.5350094 ,-1.2364857 ,-3.3151364 ,-0.92507887, 2.8038094 ,-1.8781518 , 3.6221995 , 2.4015775 ,-2.9217577 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/3.txt
new file mode 100644
index 000000000..a92abd4f6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/3.txt
@@ -0,0 +1 @@
+-1.0345451,-1.5055941,-4.144375 ,-4.727011 , 1.5841546, 4.5780725,-4.24402  ,-2.3966947,-3.0370803,-1.0234503,-0.2750057, 3.2965126
diff --git a/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/4.txt
new file mode 100644
index 000000000..2f2937fcb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Add_002/layer/uint8/4.txt
@@ -0,0 +1 @@
+-2.4460397 , 2.6090143 , 4.1773095 , 0.11204174,-3.3053472 , 2.5160108 ,-3.0612547 , 1.0667087 , 2.8952355 , 3.842513  , 0.6790793 ,-0.33375   
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/0.txt
new file mode 100644
index 000000000..d81e9c221
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/0.txt
@@ -0,0 +1 @@
+-0.510907  , 1.4089959 ,-2.1241407 , 2.4592288 ,-2.9779427 , 0.34636852, 2.2236073 ,-1.8599606 ,-2.5068367 ,-4.527614  ,-1.6316494 ,-3.9199774 ,-0.7226009 ,-2.4377425 , 2.1216922 , 3.505039  ,-3.7432935 ,-3.955653  , 3.6486173 ,-1.4690397 , 2.6959121 , 1.0808113 , 1.2353466 ,-0.22913602,-4.3915443 , 1.1245769 , 1.4626362 , 1.798673  , 1.7990736 , 0.89201635,-4.9303627 , 3.3033402 ,-4.865378  ,-4.7560496 ,-2.6321218 ,-2.4497926 ,-1.79407   ,-0.5770113 , 2.245869  , 3.885302  , 3.028755  ,-3.7664125 ,-3.6485636 ,-1.9558538 , 2.132231  ,-4.438173  ,-2.1077657 , 0.43832883, 2.14264   ,-4.680402  , 0.08914744,-2.1209    ,-4.8455005 ,-4.648564  ,-1.7070053 , 3.2453303 , 2.7448945 ,-0.36615932,-0.26296365, 2.1842651 , 0.6767488 ,-2.9756927 , 2.2691672 , 2.1817193 
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/1.txt
new file mode 100644
index 000000000..c8064ca4e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/1.txt
@@ -0,0 +1 @@
+ 4.4700613 , 2.578246  ,-3.0349965 ,-4.3796597 , 0.7688315 , 3.1931324 ,-1.9384179 , 1.7633927 , 4.016731  , 0.82653385,-1.3487217 , 0.5983474 , 4.9796753 , 1.2518785 ,-1.5897057 , 1.3097483 ,-3.5301888 , 4.7911434 ,-3.5171971 ,-1.9102467 ,-3.3005824 , 4.6697702 ,-2.2170236 ,-3.560176  ,-0.22950156, 1.2118694 ,-2.664568  , 1.37606   ,-3.2944875 , 0.6496278 ,-2.3036664 , 0.2789595 , 4.543674  , 2.1632304 , 1.707507  , 2.5155134 , 3.8962896 , 2.404059  , 4.56469   ,-0.10736128,-1.5589205 ,-2.0748043 , 0.6820406 ,-0.18399024, 2.8496518 ,-3.3359828 , 1.9705079 ,-3.4824357 ,-1.4309742 ,-4.7938485 , 1.5998274 , 1.5979563 , 1.3703158 ,-3.112683  ,-3.5520315 ,-0.04035192, 1.5651652 , 4.972494  , 0.8159539 ,-3.4126616 ,-3.4475358 , 0.65332323,-3.6707017 , 2.280633  
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/2.txt
new file mode 100644
index 000000000..1fdec4017
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/2.txt
@@ -0,0 +1 @@
+ 3.5354848e+00, 9.1399390e-01, 3.9115832e+00, 4.5195384e+00,-4.1488175e+00,-4.0608268e+00, 1.4575723e+00,-4.4007349e+00, 4.9681549e+00, 6.9878888e-01,-4.4128213e+00,-3.8517938e+00,-4.7306256e+00, 2.2618176e-01,-3.6951845e+00, 2.9969633e+00, 3.0128233e+00, 5.6477183e-01,-1.0598251e+00, 1.2899172e+00, 9.6092409e-01,-3.6659038e+00,-1.3825614e+00, 3.7959399e+00,-4.9993000e+00,-4.9668365e+00, 2.6209738e+00, 1.7419627e+00, 3.4296618e+00, 1.0272391e+00,-4.6021142e+00,-6.6580313e-01, 4.7928743e+00, 3.7836730e+00, 4.7099791e+00,-3.1748291e-03, 4.2090578e+00,-3.8970560e-01, 2.1450028e+00, 1.7832998e+00, 3.9223313e+00, 4.6870313e+00, 1.2538388e+00,-3.9964933e+00, 1.4849350e+00, 1.8416238e+00, 2.6485243e+00,-1.5326008e+00, 7.2043061e-01, 2.9865074e+00, 4.3521776e+00, 1.5164815e+00,-2.6253865e+00, 4.3517418e+00, 4.3981061e+00, 1.7968456e+00,-4.8398142e+00,-1.7621598e+00,-8.1946427e-01, 2.7901583e+00, 4.8448319e+00, 1.2321786e+00,-1.1610132e+00, 9.6679872e-01
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/3.txt
new file mode 100644
index 000000000..152f1eba1
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/3.txt
@@ -0,0 +1 @@
+-0.20464787, 2.5710201 , 0.59214157, 4.259379  ,-3.540491  ,-4.372994  , 2.3727858 , 0.69051546,-4.4903455 ,-0.9209778 ,-4.8972993 ,-4.2514396 ,-0.5489476 ,-4.057988  ,-2.1644015 ,-3.0984209 , 0.03034865,-0.04239147, 1.883851  ,-1.9254268 , 1.2777244 , 4.0522346 , 1.5753645 ,-4.6625195 , 4.15404   , 4.426107  ,-3.6130836 , 2.3792834 , 1.5132596 ,-1.441894  , 4.829869  , 3.089136  , 0.72886735, 1.7921836 ,-1.6057556 ,-3.0386446 ,-2.1475565 ,-2.0181272 , 4.7128267 , 4.0448503 ,-2.7346356 , 4.034245  , 4.6702056 ,-4.1446853 , 3.8138201 ,-1.426579  ,-1.9305505 ,-1.629861  , 4.905659  ,-1.3258263 ,-3.6109068 ,-1.8653567 , 1.6524535 , 2.7292273 , 2.7259579 ,-0.07184646,-4.0560045 ,-1.4042319 , 2.0582678 ,-3.1527088 ,-1.2431176 , 4.3459873 , 3.0906582 , 0.15360738
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/4.txt
new file mode 100644
index 000000000..572ed4eae
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/channel/int16/4.txt
@@ -0,0 +1 @@
+ 2.0034928 ,-3.0825787 ,-2.5465937 , 0.08314594,-1.1672878 ,-4.022598  , 1.6787933 , 1.3834887 , 4.716868  ,-4.367728  ,-4.5804863 ,-4.5252604 , 1.3848665 ,-0.12535067, 0.46983087,-4.486054  , 4.1792865 ,-2.1861317 ,-4.8893723 , 1.5433223 , 0.93831426, 3.021679  , 1.6269827 ,-1.6597769 ,-4.2329807 ,-2.364968  ,-0.7685641 , 1.7036824 , 4.3565035 ,-0.6187156 , 2.735659  , 3.8990567 , 1.31432   , 3.974897  , 2.3083837 ,-3.358307  , 0.5860206 , 3.1305172 , 2.6267734 ,-4.908361  ,-3.1134105 , 1.256104  , 0.43383893, 3.2501204 ,-4.7082167 ,-4.090534  , 0.5735267 ,-2.4570494 ,-0.96135706,-0.11614823,-0.68806463, 4.2216    , 1.9002053 ,-1.1091975 , 2.3109403 , 1.8851153 ,-0.82943046, 2.5827515 , 2.711629  ,-4.0693617 ,-2.2042627 , 3.8568714 , 2.7913945 ,-2.9618587 
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/0.txt
new file mode 100644
index 000000000..e42cbf88b
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/0.txt
@@ -0,0 +1 @@
+-4.1358833e+00, 1.7854472e+00, 4.1751757e+00, 5.5915713e-01,-2.6459083e-01,-1.7176826e+00,-1.8155930e+00, 2.8710868e+00,-2.7043006e+00, 1.0959731e+00,-2.0176995e+00,-6.5950048e-01,-3.6413522e+00,-4.1966043e+00,-2.6820884e+00,-3.6055098e+00, 3.6852844e+00, 8.9128174e-02, 1.3107824e+00,-3.6425626e+00,-3.2318896e-01, 3.6238370e+00,-4.9837337e+00,-4.0550299e+00,-1.4882606e+00, 1.5547658e+00,-1.1696080e+00, 2.1651111e+00, 4.9318314e+00,-3.5928023e+00,-1.2348548e+00,-1.7002642e+00, 1.7365140e+00,-8.8151926e-01,-4.1655774e+00,-1.0166957e+00,-3.7440193e+00, 2.8588972e+00, 4.1286149e+00,-4.9504828e+00, 4.8477168e+00,-2.2587967e+00, 2.8542519e+00,-7.9565448e-01, 6.8252671e-01, 2.5875571e-01,-6.3935977e-01,-4.8547015e+00, 4.1373856e-03,-1.3893708e+00, 8.8775367e-01, 2.1222150e-01, 3.1871333e+00, 1.3869151e+00,-3.8274391e+00, 3.2623324e+00, 7.2669631e-01, 1.0303619e+00, 8.1438148e-01, 8.1272924e-01,-2.7527118e+00, 1.8215455e+00,-1.6416427e-01, 4.9103169e+00
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/1.txt
new file mode 100644
index 000000000..7caf8ce9e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/1.txt
@@ -0,0 +1 @@
+-4.250757  , 1.4186406 , 0.63726735,-0.35924944, 1.9436699 , 3.2695885 , 3.6638293 , 4.5166173 , 1.3807241 ,-1.9112543 ,-1.9026492 ,-0.4800549 , 2.818216  ,-4.6390033 ,-3.8570547 , 3.6634028 ,-1.2112037 ,-1.3335027 , 1.3524677 , 2.7240725 ,-3.8335826 , 1.1397903 ,-3.1570992 ,-4.802078  , 3.8334577 , 0.23457901, 0.7132307 , 2.9887354 , 2.9702394 ,-1.4113717 ,-0.66712093, 0.77366674, 1.9308351 ,-0.45465755, 4.925366  , 2.4214447 , 2.8401468 , 0.49789894, 0.53141665,-2.7466767 , 0.2059374 ,-4.9661317 ,-4.1334467 , 1.6928389 ,-0.42529574, 1.1033608 , 4.275776  , 1.5063075 , 2.3528252 , 0.79505247, 3.9829993 ,-4.8472476 ,-1.2752185 , 3.7365675 , 1.976164  ,-4.742636  ,-2.7199092 ,-2.9191706 ,-3.181069  ,-4.489485  , 4.0847454 , 2.2164    , 0.9725334 ,-0.72566307
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/2.txt
new file mode 100644
index 000000000..7facffa57
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/2.txt
@@ -0,0 +1 @@
+-3.8293874 ,-0.13678598,-2.5444264 , 1.654611  ,-4.3037786 ,-3.4240584 ,-4.5642533 , 4.1250315 , 1.0469195 , 4.2802887 , 3.1617825 ,-3.1706758 ,-0.99622065, 2.7707603 , 3.7494645 ,-1.4548893 , 2.328633  , 1.7976477 ,-1.2107176 ,-2.0178459 ,-0.6488357 ,-2.9393644 , 2.8918762 , 3.6192262 ,-4.1777225 , 1.3264071 , 0.32620123, 0.7890992 ,-3.304334  , 3.4893208 , 2.5354576 ,-4.7718143 , 3.8602633 , 0.4927564 , 2.2971296 ,-0.3296792 , 2.8115997 ,-0.75152504, 0.558675  ,-2.343631  , 4.650826  ,-3.0893488 , 0.8726873 , 0.24922371, 2.7634025 , 1.0358421 ,-3.862506  ,-3.169402  ,-2.5373347 , 0.9484093 , 4.1409917 ,-4.0408096 ,-2.7231216 ,-2.548547  ,-2.6315095 , 0.8164778 ,-3.017436  , 1.1860138 ,-1.8634807 , 1.8684052 , 1.8657844 , 1.7747321 ,-3.1472425 ,-1.3989028 
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/3.txt
new file mode 100644
index 000000000..0be8fdd19
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/3.txt
@@ -0,0 +1 @@
+-2.0492268 ,-2.2555764 ,-1.3543441 ,-3.7278662 ,-4.8601675 , 3.1095552 , 4.6319957 , 3.0211062 , 1.7870535 , 4.8839574 ,-1.3494394 , 2.635408  ,-0.24201432, 1.312397  , 0.16790341, 2.42507   ,-3.101355  , 3.1760497 ,-4.500736  ,-2.53691   , 1.064206  , 0.62096214, 2.803344  ,-4.6166744 ,-4.624786  , 3.667064  ,-1.484021  , 4.9401817 ,-3.763283  , 3.4351027 ,-2.906393  , 4.9945946 ,-3.2997096 , 3.6325612 ,-0.47211674, 0.28783202, 1.8703817 ,-4.042374  ,-3.3353784 , 4.9085765 ,-1.6753131 ,-3.4926984 ,-4.8663344 ,-4.495712  , 2.3402312 ,-1.0722051 , 0.28559962, 2.1208072 , 1.3024254 , 3.4810693 , 0.09860361, 1.695624  , 1.3901931 , 1.6858819 , 3.8231227 , 4.5972557 ,-4.6835494 , 0.5753765 ,-2.2377403 , 0.13013013,-2.1165738 ,-0.26044115,-0.653468  , 1.1010929 
diff --git a/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/4.txt
new file mode 100644
index 000000000..7e2d618f9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/AveragePool2D_000/layer/uint8/4.txt
@@ -0,0 +1 @@
+ 4.397323  ,-0.51448834, 2.5729322 ,-4.3229046 , 1.149113  ,-3.8652143 ,-1.7352968 ,-0.7575065 ,-0.41720778, 4.327346  ,-4.2363043 , 0.8653738 ,-1.7511971 ,-0.7874244 ,-4.0734816 , 2.5622475 ,-3.1229742 ,-1.1783633 , 0.4017013 ,-0.76175183,-1.058416  , 1.128772  ,-3.0143378 ,-2.6688366 ,-2.575279  ,-4.326955  , 4.175434  , 4.791393  ,-1.10654   ,-4.4417224 , 3.5057635 , 1.5339037 ,-4.0297494 ,-3.7187057 ,-0.6645762 , 4.215642  , 1.6742749 , 2.5468905 , 1.73195   ,-3.3100636 ,-4.4818826 ,-2.5627983 ,-1.4624406 , 1.2433167 ,-4.005364  ,-4.3450556 ,-1.0652863 ,-1.0240986 , 3.989825  ,-4.1690702 ,-4.595108  ,-1.1154945 , 0.65749156, 2.5127344 , 2.509761  ,-4.3936505 , 3.6513395 ,-2.3340352 ,-4.3615093 , 3.5973237 , 0.9316653 , 1.9391845 , 3.6356397 , 0.8133118 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/0.txt
new file mode 100644
index 000000000..f6444c365
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/0.txt
@@ -0,0 +1 @@
+-1.9946578,-2.9780228, 4.1516557,-3.4539075
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/1.txt
new file mode 100644
index 000000000..3bb45b23a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/1.txt
@@ -0,0 +1 @@
+-1.3113928, 2.2005532, 3.6652274, 4.487159 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/2.txt
new file mode 100644
index 000000000..735651cb2
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/2.txt
@@ -0,0 +1 @@
+ 4.5642977,-1.9772011, 0.4170904,-4.8775287
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/3.txt
new file mode 100644
index 000000000..6ebe4022a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/3.txt
@@ -0,0 +1 @@
+ 0.7775621, 3.3111284, 3.5688167,-2.8435721
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/4.txt
new file mode 100644
index 000000000..cbe6a935f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/channel/int16/4.txt
@@ -0,0 +1 @@
+-2.1520452, 3.960813 , 2.9887378,-3.768241 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/0.txt
new file mode 100644
index 000000000..9def1c2eb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/0.txt
@@ -0,0 +1 @@
+0.24671102,3.271825  ,3.979895  ,1.3334678 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/1.txt
new file mode 100644
index 000000000..eaec2409f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 1.9181111, 2.2396102,-2.8641696,-1.9045062
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/2.txt
new file mode 100644
index 000000000..3e05181cc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/2.txt
@@ -0,0 +1 @@
+4.751434  ,2.8798263 ,0.15149078,2.9485583 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/3.txt
new file mode 100644
index 000000000..19d95b267
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/3.txt
@@ -0,0 +1 @@
+-1.5743442 , 0.6716824 , 0.75737774,-0.27396253
diff --git a/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/4.txt
new file mode 100644
index 000000000..d302e07a9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Concatenation_001/layer/uint8/4.txt
@@ -0,0 +1 @@
+-1.0539489 , 1.9595883 , 0.19975437, 2.526178  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/0.txt
new file mode 100644
index 000000000..7343be940
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/0.txt
@@ -0,0 +1 @@
+ 4.7784176 ,-3.664888  ,-3.661973  , 4.151922  ,-1.6849446 ,-3.5729008 ,-4.4134717 ,-4.4660335 ,-0.4282645 ,-3.3516212 , 4.3843856 ,-0.9578603 , 0.26670414, 3.1411405 ,-1.0374211 ,-2.5923402 ,-2.9806767 ,-0.8796363 , 2.035723  ,-2.8266547 , 2.2503827 , 1.0205121 ,-0.43321547, 1.2068397 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/1.txt
new file mode 100644
index 000000000..4d061522c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/1.txt
@@ -0,0 +1 @@
+-4.993156  , 2.1743555 ,-2.1204185 ,-4.686441  , 1.4717784 ,-3.7628727 , 2.2680001 , 0.5471788 ,-2.9423294 ,-3.7934408 ,-1.7061723 , 3.0367396 ,-3.9785733 , 3.5727022 ,-1.5245439 ,-1.426896  ,-1.4908481 ,-1.6985036 , 3.69504   ,-1.9351982 , 2.1623316 ,-4.7080107 , 4.9228272 , 0.65127385
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/2.txt
new file mode 100644
index 000000000..43cdf4782
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/2.txt
@@ -0,0 +1 @@
+-0.50205654,-2.502941  ,-3.8719609 ,-4.4488025 , 3.5040143 , 4.151995  ,-0.29484403,-0.12409137, 0.96817803, 2.830527  , 2.7696652 ,-4.286391  , 3.1336517 ,-4.016251  ,-3.8219085 , 0.12036578, 0.17878295, 1.2638323 , 1.6834463 , 0.14874293,-0.38150313, 0.8128281 ,-0.3667917 , 2.9356427 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/3.txt
new file mode 100644
index 000000000..dd4bef433
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/3.txt
@@ -0,0 +1 @@
+-0.34789565, 1.0015719 ,-0.06971588,-1.8463258 , 0.38877836, 3.7922597 ,-4.635005  , 0.09295294, 2.1624413 , 0.6197247 , 2.6534898 ,-1.1711022 , 0.2883494 , 1.5565678 , 4.760776  ,-2.0096483 ,-3.2912285 , 2.5289662 , 0.01655606, 3.8149574 ,-3.1482995 ,-0.20109762,-2.0927203 , 3.4672668 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/4.txt
new file mode 100644
index 000000000..e24f918c0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/channel/int16/4.txt
@@ -0,0 +1 @@
+-2.5574467 ,-3.2333457 , 0.78457445,-0.90771025,-2.122042  , 0.11078442,-4.327074  , 0.19774374, 3.3724785 ,-4.603591  , 0.577071  , 1.7609557 ,-4.16058   ,-3.5114238 ,-1.63763   , 1.1950372 , 1.5514423 , 2.041781  , 2.5683656 , 4.7605743 , 1.9907378 ,-2.6334338 , 2.421719  ,-0.28131783
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/0.txt
new file mode 100644
index 000000000..9b8e65bdb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/0.txt
@@ -0,0 +1 @@
+-0.47067887, 1.6245431 ,-3.827272  ,-0.8795752 , 4.1726565 , 0.8228656 ,-2.4443243 , 3.43631   ,-3.1509953 ,-2.5931532 ,-4.7419844 , 1.2288777 , 3.2510107 , 4.9364767 ,-4.497389  , 0.08496564
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/1.txt
new file mode 100644
index 000000000..1556a7661
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/1.txt
@@ -0,0 +1 @@
+ 3.9474714 ,-2.6625853 ,-2.283004  ,-0.02414002,-4.392698  , 4.240785  ,-2.4344695 ,-2.2256756 ,-3.9908662 ,-2.2188916 ,-0.11010418, 2.3360708 , 3.3392577 , 2.3586333 ,-0.8532365 ,-4.500675  
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/2.txt
new file mode 100644
index 000000000..e655c07cf
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/2.txt
@@ -0,0 +1 @@
+ 0.10326682, 2.1289074 , 3.5456502 , 0.35302642,-4.4911804 ,-2.134726  ,-2.0466588 ,-2.3322723 ,-1.3623004 ,-1.9854587 , 0.1923696 , 3.6274686 ,-1.5344565 ,-4.15489   ,-2.5438864 , 3.8615432 
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/3.txt
new file mode 100644
index 000000000..80f93444f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/3.txt
@@ -0,0 +1 @@
+ 3.5030484 , 1.4227937 ,-2.9866784 ,-2.2149732 , 1.9371717 , 4.883812  , 1.8504329 ,-4.089092  , 3.9873357 ,-0.856626  , 2.1024404 ,-2.6687756 ,-3.3716505 , 0.7224773 , 3.8326802 , 0.00597815
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/4.txt
new file mode 100644
index 000000000..62f006bc3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/channel/int16/4.txt
@@ -0,0 +1 @@
+-3.540293  , 0.9187398 ,-1.7232021 , 4.2609916 ,-0.36517945,-4.528093  , 2.0078566 ,-2.8224776 ,-0.52138734, 2.083479  , 4.13081   , 1.6346717 ,-1.786156  ,-1.868778  ,-2.003354  , 4.861706  
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/0.txt
new file mode 100644
index 000000000..0f8cc18e3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/0.txt
@@ -0,0 +1 @@
+-2.096356 ,-2.8830805, 3.2734153, 4.75676  ,-1.4854555,-3.761438 , 4.9110246, 2.0026941,-0.5639237,-0.8497009, 0.2410297,-0.5871555, 3.2145824,-3.0885973,-3.6895127, 2.1789384
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/1.txt
new file mode 100644
index 000000000..c50bf39a9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/1.txt
@@ -0,0 +1 @@
+ 1.1109806 , 0.7826781 ,-0.74992985,-1.7466942 , 1.580436  ,-2.9916067 ,-3.5476816 , 3.8817286 , 1.6322273 ,-2.4337368 , 2.7259283 ,-3.9455266 ,-0.49857467,-0.08708442, 1.8806074 , 1.2781891 
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/2.txt
new file mode 100644
index 000000000..593319fae
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/2.txt
@@ -0,0 +1 @@
+ 0.00969718,-1.6622301 ,-4.5473747 , 2.6495044 , 2.09744   ,-3.0735466 ,-1.4153806 ,-0.5514852 , 2.0267386 , 2.7606184 , 0.65523773,-4.741409  , 3.9757168 ,-1.4596124 , 0.16563408, 4.203863  
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/3.txt
new file mode 100644
index 000000000..b78a8b931
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/3.txt
@@ -0,0 +1 @@
+ 0.9968256, 3.1801567, 4.973664 ,-4.5165515,-1.3277559, 2.102693 ,-4.122782 ,-4.2825947, 3.6787858,-2.7863328, 2.5955915,-3.6259568, 3.8748455,-2.6964746, 4.419685 , 1.4595481
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/4.txt
new file mode 100644
index 000000000..2ca9c88c5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/channel/int16/4.txt
@@ -0,0 +1 @@
+ 0.38209972,-1.798579  , 0.30720893,-2.2759287 , 2.5577447 ,-2.3818388 ,-4.9389277 ,-1.6131732 , 4.463589  , 0.99052995,-3.0197868 ,-4.8446875 ,-0.9983996 , 1.3928102 , 4.7655883 ,-4.643676  
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/0.txt
new file mode 100644
index 000000000..34c8d9efa
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/0.txt
@@ -0,0 +1 @@
+ 4.8284526 ,-2.691639  ,-4.6848307 ,-2.197698  ,-1.6452677 ,-4.8613544 , 1.5453953 ,-0.39886513, 4.0380425 ,-0.04039171, 2.301464  ,-3.714465  ,-4.421984  ,-4.1743374 , 3.4737499 , 2.588904  , 3.1158638 ,-1.1720241 ,-3.5342755 ,-3.751013  , 0.01882128,-2.8355627 ,-1.4244885 , 2.297474  ,-3.1879032 ,-0.44498882,-4.940659  , 2.730317  , 2.8211846 ,-2.5893457 , 2.2638    ,-2.6992602 , 2.401231  , 1.2854915 , 2.346975  , 0.5765137 ,-3.2890596 , 2.063855  ,-4.731475  ,-0.557063  , 0.54349065,-1.5450648 ,-0.5341154 ,-3.4168484 ,-2.0516255 , 4.346786  ,-4.3981385 , 4.674038  ,-2.3534555 ,-4.851739  , 3.9623237 ,-3.9519038 ,-3.5602655 ,-2.0898452 ,-1.3825879 , 4.720879  , 4.5703683 , 1.9761374 , 3.492937  ,-2.2800248 ,-3.1707876 ,-2.384202  , 1.2742764 ,-0.76953995
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/1.txt
new file mode 100644
index 000000000..8f4f6af7f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/1.txt
@@ -0,0 +1 @@
+ 0.48372003,-4.9410214 ,-1.1078873 , 1.4024726 , 0.8605548 ,-3.7719595 ,-4.6056633 ,-1.6772615 , 4.8427873 , 3.5070498 , 0.9382978 , 3.5805914 ,-4.9232273 , 3.5987895 , 1.9776107 ,-2.147376  ,-0.17856362, 1.4812931 , 4.740502  , 2.6861246 ,-3.5170615 , 3.935952  ,-0.6979904 ,-4.5876417 ,-4.137302  ,-0.10453341, 1.5544755 ,-3.1888318 ,-3.6421032 , 2.1766756 ,-2.4349477 , 0.5977967 ,-0.47258878,-3.3868475 ,-2.2278457 , 4.69384   ,-2.4523015 , 1.7757872 , 1.4693154 ,-0.50499505, 3.0612497 , 2.0326838 , 3.9193847 ,-2.5797045 ,-3.61123   ,-1.6678554 ,-1.3377333 ,-4.931451  , 4.662463  ,-1.4456365 , 0.61296666, 1.3043345 , 4.7175946 , 1.1552657 , 2.8492255 , 3.9504783 , 1.7083023 , 0.593394  ,-0.42887765, 0.8986678 , 0.61605704, 0.49256468,-0.20637548,-0.72854817
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/2.txt
new file mode 100644
index 000000000..c666b1db5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/2.txt
@@ -0,0 +1 @@
+-1.2767923 , 4.635175  , 4.634353  , 2.2961338 ,-4.525523  , 4.7173486 ,-1.118744  ,-2.0162437 ,-4.089979  , 4.2400355 ,-2.121403  , 1.3392338 ,-0.14014104,-2.2286527 ,-4.3265066 ,-2.9784994 ,-0.9419822 ,-3.659519  , 3.0981843 , 0.95809066, 1.6028512 ,-2.7508621 ,-1.1240566 , 0.6601117 ,-3.3664095 , 3.5669692 ,-4.5485654 ,-4.025274  , 4.5913305 ,-4.5493083 ,-3.7018857 , 4.950167  , 4.0111384 ,-2.1159806 ,-4.297305  ,-0.74918365,-3.2612495 , 2.8310492 ,-0.7887121 , 3.533797  , 0.29970253, 0.80299735, 4.7283497 , 4.5975323 ,-0.5759047 , 1.4421924 , 1.6174877 ,-3.5346878 ,-0.3994039 , 1.3907117 , 4.46784   , 0.61307186,-2.4161408 ,-0.78354657, 0.8080768 , 2.6124284 ,-4.398268  ,-3.96591   , 0.7457658 , 0.17488185,-4.315872  , 2.2258658 ,-4.692843  , 2.3056333 
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/3.txt
new file mode 100644
index 000000000..89c21282d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/3.txt
@@ -0,0 +1 @@
+ 4.4875917 , 0.81472397, 3.6098514 ,-1.0605507 , 0.16224344,-0.02751762, 4.3322067 , 1.3057764 , 4.8877854 ,-3.1575646 ,-0.27372208,-2.8751788 , 1.5704893 , 1.7208904 ,-0.90212977, 3.367112  , 1.1692609 ,-3.2632163 ,-0.9717545 , 3.848303  ,-3.8752236 , 0.07510394,-3.4816995 , 2.816553  ,-0.92604953, 3.4131455 , 1.5475662 , 0.9335745 ,-1.9984822 ,-0.88401717, 1.7502712 , 1.6837802 , 3.7524889 ,-4.0362206 , 0.10621884, 1.3092963 , 3.9410071 , 4.8334627 ,-0.5977371 ,-3.9537277 , 2.6244955 , 3.84642   , 3.866643  , 2.263317  , 4.471196  ,-2.5969682 ,-4.848496  , 3.8593578 ,-2.3364575 , 0.93461806,-4.9278994 ,-3.981684  , 0.79419816, 3.9157522 ,-1.6737558 ,-2.6708112 , 1.8820635 ,-3.9298456 , 0.36201027,-4.8502846 ,-4.7925324 , 4.528682  , 3.6746473 ,-2.756068  
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/4.txt
new file mode 100644
index 000000000..b99983b43
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/channel/int16/4.txt
@@ -0,0 +1 @@
+-3.157378  , 0.18785916,-2.142268  ,-3.3184152 , 4.1268454 ,-1.8841478 ,-1.2312942 ,-1.8732002 , 4.8144236 , 3.3052166 , 1.6794424 ,-4.1501184 ,-3.8177824 , 0.4225169 , 4.045759  ,-4.03065   , 0.5957578 ,-3.2322567 ,-4.649256  ,-4.227309  ,-1.7070007 , 3.4748268 , 4.4875193 ,-4.104651  ,-2.1271632 , 4.7759695 , 4.427194  , 4.3794928 , 4.3293757 ,-0.7616468 ,-2.7183442 ,-2.5630903 , 3.8545947 , 0.7578416 , 3.865139  ,-0.60673547,-4.998677  , 1.6777763 ,-3.0278416 , 4.9559903 , 2.2378786 ,-4.918824  , 2.6796322 ,-0.575707  ,-4.9119215 ,-0.26456386, 2.9720273 ,-4.655656  , 1.5877472 , 3.5130315 ,-4.509794  ,-3.7033517 ,-0.7976036 ,-0.17557316, 2.7904704 ,-4.195402  , 1.6284761 , 1.9448873 , 3.0018816 ,-2.6292274 ,-2.475803  , 1.2488724 ,-1.0502579 , 1.1644874 
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/0.txt
new file mode 100644
index 000000000..31a2db03e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/0.txt
@@ -0,0 +1 @@
+-4.1984134 , 3.7565446 , 1.3521377 ,-4.0263743 ,-1.929471  ,-3.7523155 , 1.3858393 , 4.1565247 ,-2.4681342 , 0.3598748 ,-2.0044599 , 3.7168603 , 3.6330557 , 3.0176272 ,-4.4643235 ,-0.1893698 , 3.8839848 ,-4.5703125 , 3.365731  , 4.5556674 , 4.954971  , 1.7591819 ,-0.9497736 ,-0.8527185 ,-1.1863561 ,-4.522639  ,-4.3187394 ,-3.702939  , 0.15341021, 0.8564923 , 1.9076811 , 4.2765    ,-3.7695112 ,-1.6033245 , 2.3159432 ,-1.6656336 , 1.4186145 , 4.334284  , 4.0654674 ,-4.518256  , 0.72815216, 2.5133176 ,-4.238172  , 1.0198449 ,-0.9638457 , 2.5847483 , 4.0381308 , 4.472872  , 0.11794223, 1.3358012 , 1.7975981 , 2.168553  ,-3.5131238 , 3.8412008 , 3.851232  ,-2.130775  , 3.556102  , 0.69062364,-4.668594  ,-4.619906  ,-2.87768   ,-1.0679495 ,-4.523185  , 4.184176  
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/1.txt
new file mode 100644
index 000000000..2bdd62b24
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 2.9193265 , 4.315574  ,-3.7834768 , 3.4352486 , 4.1452866 ,-4.0322523 , 1.8039155 ,-4.080042  ,-1.1999705 , 4.9018297 ,-0.27180746, 1.709373  , 4.3322196 , 4.9179945 ,-3.977508  , 2.3486571 ,-0.11026379,-0.24730131, 2.3269305 , 2.1862001 , 0.92486495, 3.5822759 , 2.8370361 , 3.915398  ,-0.6385275 ,-0.02720119,-1.408676  ,-4.4472733 , 1.2901759 ,-4.60209   ,-2.9502335 ,-2.650517  ,-1.4038593 ,-2.967456  ,-2.0060933 ,-1.9603083 ,-0.4727794 ,-1.7877682 ,-3.9565926 , 1.4452418 , 2.5925353 ,-4.5134907 ,-4.195412  , 2.4681656 , 0.7140492 , 3.0753498 , 0.269442  ,-4.768041  ,-3.5370746 , 1.0272335 ,-0.7654047 ,-1.977087  , 3.1920779 , 0.37378865, 4.016262  ,-3.3201067 ,-4.7767315 ,-3.5074112 ,-4.094166  , 1.6035818 , 1.6506963 ,-3.2142932 , 4.7714067 ,-1.7164946 
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/2.txt
new file mode 100644
index 000000000..8c770f61d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/2.txt
@@ -0,0 +1 @@
+-1.8028042 , 1.7280815 ,-3.0464594 ,-2.810487  , 0.582805  ,-1.786865  ,-1.7263526 ,-0.36871073, 3.3955328 ,-3.9523299 ,-1.880003  , 4.9068613 , 4.6292953 , 3.9778202 ,-1.859954  , 2.8149757 , 4.5020967 ,-4.160163  , 1.9295161 ,-1.2508658 , 0.5669804 , 0.99246883,-2.4829247 , 0.88920474,-3.7942843 , 2.4626305 , 4.3087935 , 3.0680852 , 3.0893688 , 3.1640174 ,-0.41890725, 0.5377459 ,-4.0344224 ,-4.5812287 , 0.5720303 , 1.802316  ,-0.31413126, 2.9586952 , 1.1723012 ,-4.696369  ,-3.7047153 ,-1.8109767 ,-3.6122723 , 1.2727392 , 4.4057164 , 3.8347735 ,-4.739083  , 2.4655118 , 0.45258832, 4.0693913 ,-3.3486447 ,-0.64714307, 1.4990507 , 2.771129  ,-0.6109979 ,-1.0617865 , 2.0837703 ,-1.633663  , 1.8431798 ,-4.3942385 , 4.8523426 , 1.1941985 , 3.0366988 , 4.7991366 
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/3.txt
new file mode 100644
index 000000000..8a4c9ebb5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/3.txt
@@ -0,0 +1 @@
+-2.2375767 ,-1.1274278 , 0.18025301,-4.598087  , 1.1042122 , 3.1241179 , 1.9084688 ,-1.214722  , 4.596646  , 4.1969523 , 4.658112  , 3.143779  ,-2.6940444 ,-1.5482163 , 1.542811  ,-1.1338089 , 3.721594  , 0.24673286, 4.71102   , 2.7811737 , 1.171089  , 4.145586  ,-2.6335135 , 1.1190183 ,-3.7932637 ,-4.6548123 ,-3.10302   ,-3.392706  ,-3.856141  , 0.6618614 , 0.9668614 , 4.4293485 , 1.3193    , 4.983464  , 1.659716  ,-3.185926  , 4.8983006 , 1.6323217 , 0.18800464,-1.9328839 , 4.6031475 , 3.459718  , 4.128766  ,-3.4701612 ,-2.3796144 , 1.6752707 ,-3.6569223 , 2.922704  , 3.642789  ,-1.6817225 , 3.151759  ,-1.5401909 ,-3.8259532 , 2.4556105 ,-4.4989905 , 1.2779988 ,-0.62634754, 3.5827441 ,-0.82541114, 2.1539748 , 4.583461  , 1.2231985 ,-1.4457659 ,-2.9194565 
diff --git a/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/4.txt
new file mode 100644
index 000000000..5110f86aa
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/MaxPool2D_000/layer/uint8/4.txt
@@ -0,0 +1 @@
+-4.011289  , 0.9077414 ,-2.8109396 ,-4.33598   ,-2.6516347 ,-3.917852  , 3.2461808 , 1.7588768 ,-1.9439132 , 2.190185  , 1.5180751 , 0.3587409 ,-4.3434815 ,-4.1376143 , 3.750847  , 1.5820616 , 0.03843357, 4.71235   , 1.0592757 ,-1.7640393 , 0.44547582, 2.8698466 , 4.5816092 , 4.6638517 , 1.4207541 , 1.863644  , 3.6007912 , 0.6800818 ,-2.4884489 , 3.0707197 , 3.3961668 ,-4.331953  , 2.7828538 ,-0.16146964,-4.9070745 ,-2.9787786 , 0.3337284 ,-3.935533  ,-3.303555  , 2.376896  ,-4.7058997 ,-2.2409894 , 0.07352693,-2.6024988 , 4.9593167 ,-4.7717366 , 1.6590588 , 4.063875  ,-3.8855767 , 2.6274624 , 4.901856  , 4.157007  ,-3.292969  , 3.579326  , 3.9860668 ,-3.0936542 ,-4.7793274 , 0.71697485,-2.0354068 ,-2.1414943 , 3.6339438 , 0.10732502,-0.86129206, 4.4152017 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/0.txt
new file mode 100644
index 000000000..0a60ff544
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/0.txt
@@ -0,0 +1 @@
+-2.2074962 , 2.34192   ,-2.709864  , 1.9128946 ,-2.2117736 , 3.7492905 ,-3.846714  ,-0.14091022,-0.05969257,-0.7129768 , 3.6450825 , 2.5212145 , 0.7989227 ,-2.5739667 , 3.970536  ,-2.6660712 ,-1.8958896 , 2.300999  , 3.1438015 , 1.2797564 , 0.08288309,-4.293988  ,-2.2450752 ,-3.3478742 , 0.15746279, 4.5632443 ,-2.6077847 , 2.8849685 ,-4.3597856 , 3.5858352 ,-1.0896981 ,-2.922654  , 1.7538116 , 0.95464027,-3.7287776 ,-2.7478595 , 3.6285002 ,-0.41975695,-0.35182676, 3.6266248 , 1.3659422 , 3.8589127 ,-4.233086  , 0.8218454 ,-3.4972434 , 4.615624  , 1.9660673 ,-0.70521665,-2.0720336 , 2.7834241 ,-2.358451  , 3.1219418 , 0.5374302 , 0.93982154, 0.55175847, 1.659779  , 1.5089085 ,-3.2819571 ,-0.1463864 ,-4.5264926 , 3.4349985 , 4.2010264 ,-1.7860808 , 1.8264372 ,-1.4440862 , 1.1329567 , 1.9254624 , 1.7995164 ,-2.663578  ,-4.799549  ,-1.6716652 ,-4.423277  ,-0.26002383, 3.1433315 , 2.2171266 , 4.1119127 ,-3.1298876 , 0.6021499 , 1.9682903 ,-2.8893127 , 3.4758754 ,-4.870843  ,-4.800037  ,-4.1607656 , 0.47251   ,-3.0049233 ,-2.155954  , 2.175629  ,-2.0159044 , 4.8229475 ,-4.4880743 ,-0.984097  , 4.52838   ,-4.676001  ,-0.35018834, 3.1078057 ,-3.6997106 ,-3.1540651 , 0.45590773,-1.3007423 , 4.9675007 ,-4.5856795 , 4.144678  , 2.9743934 ,-1.6183054 ,-0.5026187 ,-0.45754507,-3.0440507 ,-4.2907186 ,-1.5810571 ,-3.3099668 ,-2.9118912 ,-2.1923656 , 2.0955439 , 2.3726344 ,-4.7950087 , 3.3958588 ,-0.55981565,-1.808636  ,-4.7620907 , 4.050268  ,-2.427531  ,-1.1281991 , 2.8483155 , 1.5577518 ,-1.1400249 , 1.5824546 ,-4.822372  ,-4.9407682 , 4.3104033 ,-0.07276537, 2.0490766 ,-2.0670223 ,-2.3535848 , 0.5239419 , 0.37653625,-3.6111376 ,-4.8165507 ,-3.231536  , 1.7372189 , 4.8433757 ,-0.72669244, 4.395995  ,-0.5532281 ,-2.7518723 ,-0.46534494, 1.229831  , 2.8872015 , 1.0584512 ,-0.83641016, 3.9595466 , 3.253532  ,-3.605762  , 0.78982335, 4.0811715 , 3.3635073 ,-1.9074551 , 1.7357349 ,-1.8057578 , 2.2644591 ,-0.91638184,-1.9505771 , 1.9017309 , 1.9173802 ,-1.2468227 , 2.0470798 ,-4.1916666 , 1.6746832 ,-3.0363586 , 4.785653  ,-0.53253794, 4.34697   , 0.61329865, 4.6044106 ,-4.347125  , 3.0415568 , 0.85138226,-3.42608   ,-3.600788  , 2.4055347 ,-0.6169718 , 4.00248   , 0.25979143,-0.88910925,-0.22139451, 4.357685  ,-3.0035346 , 3.5880077 ,-1.8679962 ,-4.3341603 ,-4.9402742 , 1.428447  ,-2.847086  , 2.1966681 ,-4.905821  ,-1.7971262 , 1.8915637 , 2.2758172 ,-1.3374038 ,-3.0997677 , 1.736807  , 4.50119   ,-2.6054833 ,-3.0651853 , 4.685231  , 0.893667  ,-1.1920239 , 4.8786464 , 2.7724717 ,-1.4378481 , 4.3327556 , 1.2405481 , 1.8002224 , 2.0331264 , 1.1052294 ,-1.3300043 , 2.8293777 , 0.7841656 , 0.47411916, 3.7697432 , 1.7962953 ,-2.545056  , 3.43001   , 4.9719415 ,-4.8534617 , 3.4633894 ,-2.566673  ,-0.80052173, 1.4536021 ,-1.5736219 ,-3.9482996 ,-1.5102588 , 1.1547778 ,-4.6677794 , 3.2731333 ,-4.059292  ,-1.1536314 , 0.84613454,-0.4756105 , 2.231564  , 2.0913293 ,-2.983188  , 3.1336026 , 1.9971098 , 4.7132673 , 0.2544511 ,-1.7493942 ,-4.0819116 , 4.5831046 , 4.983917  ,-1.4336734 , 2.770158  , 1.7522675 , 1.7341546 ,-3.7084382 , 2.820421  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/1.txt
new file mode 100644
index 000000000..243ae7ac5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/1.txt
@@ -0,0 +1 @@
+ 4.647416  , 4.9679117 , 3.1563618 , 2.097811  ,-4.791759  ,-0.7455675 ,-4.6186633 ,-1.3497456 ,-0.3574563 , 2.7634604 ,-4.788801  , 4.08388   , 4.967569  ,-0.91977555, 2.4698513 ,-3.888301  , 4.3066487 , 0.9423939 , 0.05353882,-3.3108146 ,-4.990351  ,-4.979936  ,-3.8345113 , 3.8786337 ,-2.8743896 , 3.924127  , 3.0873044 , 2.4747493 , 1.9881026 , 4.2850347 , 4.7974505 ,-4.713631  , 4.4727798 , 4.3543534 ,-0.58807266, 2.8979537 , 4.9961557 ,-2.088324  ,-0.11115739, 1.3920798 , 0.73489606,-0.26524037,-3.0612137 ,-2.2441452 , 2.4489026 ,-1.1380061 , 3.3269863 , 0.39811748, 0.06907854, 3.5682824 ,-3.195291  , 3.2613728 ,-4.175812  ,-1.1097213 , 0.01470342,-3.9216485 , 2.005782  ,-4.5967135 , 0.2766234 , 0.50879586, 0.9087977 ,-4.582055  , 3.9930925 ,-3.2334676 , 4.063303  , 3.280631  ,-4.08185   , 1.6128619 , 3.684287  ,-2.5079417 , 1.29454   ,-0.8991583 ,-4.7053046 , 2.5193672 , 1.4263965 , 4.2287545 , 3.2037172 , 0.24987306,-1.4175392 ,-2.7125013 ,-1.5511144 ,-2.6464562 ,-4.910153  , 4.2622967 ,-1.2177622 ,-0.2950588 , 3.0605023 , 4.2451105 ,-0.4860682 , 0.0829033 ,-1.8873469 ,-0.12465403, 3.1823132 ,-4.1410875 ,-0.57973766,-1.9339868 , 2.4527252 ,-4.1042285 , 1.2916591 ,-0.43612963,-4.5488896 ,-3.3105433 ,-4.9393787 ,-4.237338  , 3.408978  ,-3.0267413 , 0.44544792,-1.9766037 ,-1.5030789 ,-2.8587856 , 3.3850634 ,-3.9410233 ,-4.052784  , 0.5824722 ,-0.07491566, 0.57859915, 0.72390985,-1.8118609 , 0.3350256 ,-1.5373514 ,-0.03153525,-2.8978603 ,-0.86536926,-2.2109356 , 2.7301579 , 4.0579762 ,-1.4266934 , 2.51175   ,-0.6414894 ,-1.3042144 ,-2.4148684 ,-0.11735582, 4.4275966 ,-4.1753044 , 0.222479  , 0.57152927,-1.3205781 ,-1.6346083 ,-2.0534148 ,-1.5933816 , 2.4688685 , 1.7039679 , 4.0698404 ,-1.6570914 ,-4.4231663 , 1.6994585 ,-4.159936  , 1.8877546 , 2.5192266 , 2.1880152 , 0.1552987 , 2.8667293 ,-0.24308844,-2.5318801 ,-0.9884318 , 4.88736   ,-3.9448256 ,-3.54328   , 0.9494098 ,-1.4893662 , 4.6291637 ,-4.2713094 ,-1.2336831 ,-3.3976934 ,-4.9367647 , 1.7812147 ,-0.44779322, 1.8611335 , 1.6815461 , 3.4643054 ,-3.7002919 ,-2.6072085 ,-0.8358446 , 4.5414486 ,-1.4070685 ,-3.1732821 ,-1.147933  , 3.8136475 , 4.4463906 , 4.713708  ,-2.2642767 , 1.165452  , 2.0734186 , 4.6640544 , 4.5748796 ,-1.13552   , 0.0123144 , 1.2279074 , 4.5093293 ,-4.8872313 , 2.9312396 , 3.077511  ,-3.3208303 ,-3.5604925 ,-0.8427212 , 2.5253384 , 2.4704914 ,-3.271838  , 0.01163167,-0.75937307, 4.766512  , 3.2750878 , 3.757167  ,-4.989937  , 2.4368346 , 0.7778476 , 2.5371192 , 3.3019354 , 1.1787901 ,-4.6297603 ,-0.24127986,-2.6508133 , 3.2619736 , 1.6687889 ,-2.3318424 , 4.952497  ,-4.0604258 , 0.26896703,-4.83261   , 0.1513177 ,-3.3763385 , 3.9463341 ,-0.34016863,-1.4536736 ,-3.4208305 , 4.289349  , 0.5334734 ,-1.3894193 , 4.3037252 , 1.5239613 ,-0.43751678,-4.979746  , 1.5469617 ,-4.5540833 , 2.0537195 , 0.70013916,-0.1546976 , 0.7859695 , 4.67385   , 2.8416243 ,-4.6232533 , 2.8257215 ,-3.9204762 , 0.26588315,-1.0345049 , 2.9870927 ,-0.8294203 , 2.4436429 , 3.861819  ,-2.6891387 ,-0.09799351,-4.276479  , 2.288106  ,-4.5913887 ,-2.4511378 ,-2.5102944 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/2.txt
new file mode 100644
index 000000000..4a983f972
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/2.txt
@@ -0,0 +1 @@
+ 4.8798766e+00, 4.9243131e+00, 4.8819971e+00, 2.5457311e+00, 2.3260918e+00,-3.6621268e+00, 4.9664865e+00,-2.5198615e-01, 4.4377279e+00,-8.5797942e-01, 4.5949574e+00,-2.7489040e+00, 4.1843333e+00,-2.6608632e+00,-1.3712088e+00,-4.2638259e+00,-1.4219848e+00,-4.5535688e+00, 2.1673117e+00, 4.9565008e-01, 4.2422485e+00,-1.7685415e+00, 3.0243170e+00,-1.6031052e+00,-2.7372043e+00,-3.1805620e+00,-5.3593642e-01, 1.4621491e+00,-2.6961849e+00, 2.0458367e+00,-3.1595535e+00,-4.0961289e+00, 1.1753587e+00, 9.3906957e-01,-4.3159337e+00,-1.3725852e+00, 1.3072140e+00, 1.5345490e-01,-3.6739161e+00,-4.8221606e-01,-4.6497369e+00, 4.3760514e+00,-1.9013669e+00,-1.4918685e+00,-8.0158120e-01, 4.4854193e+00, 1.2181104e+00, 3.5900481e-02, 3.0020311e+00, 5.5376965e-01,-6.3398939e-01, 4.8934984e+00,-3.2580850e+00,-3.7261260e+00, 3.9005661e-04, 1.4454360e+00,-4.2321515e+00,-2.7544975e+00,-1.7314111e+00, 1.5144792e+00,-3.2667000e+00,-5.9683466e-01, 2.1015060e+00,-2.3534837e+00,-5.7822615e-01, 3.0378773e+00, 1.0944736e+00,-4.0058913e+00,-2.0754468e+00,-4.5949697e+00, 4.4344592e+00,-1.3073648e+00,-6.3490713e-01, 1.8540500e-01,-4.5599942e+00, 2.8582293e-01,-1.7934854e+00,-7.0607591e-01, 2.0355430e+00,-4.6090999e+00, 1.7777715e+00, 4.8398571e+00,-1.4936711e+00, 1.7854555e-01,-9.4804019e-01, 4.1062727e+00, 4.7332139e+00, 2.9031954e+00, 4.8915653e+00, 1.9979690e+00,-8.1200880e-01,-9.2868048e-01,-4.3415589e+00,-3.7189934e+00, 1.6964570e+00, 2.0999928e+00,-8.4056407e-01,-4.4467325e+00,-3.9956238e+00,-2.0207644e+00, 6.6059715e-01,-2.0528836e+00, 4.6259356e+00,-4.9360566e+00,-3.8701682e+00, 4.5166662e-01,-4.4485717e+00, 3.5869598e+00,-3.6782477e+00,-4.8435564e+00, 2.4834676e+00,-4.3192039e+00,-1.8272166e+00,-1.0558940e-01,-6.3537258e-01,-3.1022735e+00,-4.1558928e+00,-4.8120623e+00, 2.7207766e+00,-2.2537978e+00,-4.5614452e+00,-3.2558560e+00, 4.6432371e+00, 1.7005105e+00, 3.1943071e+00,-4.9606085e+00, 4.6081147e+00, 2.0349250e+00,-3.7596638e+00,-2.1179686e+00,-1.2983947e+00,-3.3367488e+00, 2.8882802e-01,-2.5551078e+00,-7.5433016e-01,-9.0093839e-01,-1.8420401e-01, 2.8669918e+00, 4.5551214e+00,-9.2204064e-02, 4.0048879e-01, 4.7526321e+00, 4.6743426e+00, 3.9137602e+00, 4.0956023e-01, 4.0974464e+00, 4.2964678e+00, 4.8832207e+00, 9.2255503e-01,-4.7051301e+00, 2.1469953e+00,-3.7833872e+00,-3.3037670e+00,-2.9071469e+00,-1.1760893e+00,-4.9690862e+00,-2.1627474e+00,-9.6528506e-01,-1.1314354e+00,-4.3561492e+00,-4.4200878e+00, 3.3768594e+00,-3.6639380e+00,-3.9702466e+00,-3.3278401e+00,-4.9813142e+00,-1.6705159e+00,-5.9130019e-01, 3.0274973e+00,-1.7484089e+00,-3.5171264e-01, 3.0469453e+00,-3.2046111e+00, 4.8803782e+00, 3.1684818e+00,-4.3790922e+00, 7.5879174e-01, 4.5218272e+00,-2.5914080e+00,-3.0454910e+00, 3.4617710e-01,-2.2997477e+00, 9.6962333e-02,-8.6270535e-01, 2.7255342e+00,-1.0079044e+00,-3.9956221e-01, 1.5535468e+00,-1.7760757e+00, 3.6640015e+00, 2.6955497e+00,-2.9474480e+00, 1.8736525e+00, 2.2837176e+00,-2.0603726e+00, 4.2487226e+00, 2.9422033e+00, 4.2111969e+00,-2.6478791e+00, 7.1064776e-01, 3.2437108e-02,-2.9048746e+00,-2.6201909e+00, 3.8928044e+00,-1.3717099e+00, 3.5247872e+00,-2.6093180e+00, 4.8967223e+00, 1.6301818e+00,-4.7146568e+00,-3.0934784e-01,-1.1863232e+00,-8.2591486e-01,-4.5406181e-02, 2.0123808e+00,-2.5105116e+00, 5.1519338e-02,-2.2054148e+00, 3.3810835e+00,-1.6602491e+00,-4.0132251e+00,-8.6186725e-01, 1.5905821e-01,-3.9856031e+00,-3.4374268e+00,-7.7546448e-01, 2.1148517e+00, 9.7378927e-01,-9.8872207e-02, 6.4961392e-01,-2.2246380e+00,-2.7060149e+00, 4.4127951e+00, 2.0425823e+00,-1.5409564e+00,-4.3303204e+00,-1.5762631e+00, 3.9626603e+00, 4.0119057e+00,-3.5879347e+00,-8.1219703e-01, 3.5089598e+00,-4.6350203e+00, 4.2187424e+00,-3.4364567e+00,-4.1966414e+00,-1.3043808e+00,-3.6387622e+00, 8.4374899e-01,-2.0959642e+00,-4.9362640e+00,-2.3911943e+00, 3.4672189e+00,-1.7254879e+00,-4.1084757e+00, 3.5413663e+00
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/3.txt
new file mode 100644
index 000000000..eb263afcc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/3.txt
@@ -0,0 +1 @@
+-2.8310468 , 1.7082282 , 1.0476835 , 4.861815  , 0.17206787, 2.6601248 , 3.7602353 ,-2.226125  , 2.1263433 , 4.114946  ,-4.196935  , 2.1586251 ,-4.8116245 ,-0.99921423, 4.3145165 ,-0.06300683, 0.9726341 ,-2.1688504 ,-3.915935  , 1.7099698 , 4.4308853 , 4.7915993 , 1.4655458 , 1.7440084 , 0.06360465,-3.3773756 ,-1.7748362 , 0.17255747, 4.977531  , 1.8433566 ,-1.667572  , 3.150487  , 4.2519865 , 0.25772935,-3.1341906 , 4.38715   , 1.1157507 , 3.988992  ,-1.0793406 ,-3.1062634 , 1.0461776 , 2.581822  , 0.07701357, 1.7032046 ,-1.719688  ,-1.7490327 , 3.2383342 , 2.3423147 , 2.0320241 , 4.7218304 , 4.2650065 ,-4.26582   , 3.6375117 ,-2.9219685 , 0.60033566, 4.8128977 , 4.293248  ,-1.03351   , 3.0293162 ,-4.0676146 , 1.5307257 , 3.0569024 , 4.741497  ,-1.6664437 , 0.94484675, 3.4286795 ,-3.0304806 ,-2.1725938 , 3.0652328 , 3.4135303 ,-0.5874611 ,-0.0110481 , 1.0779312 ,-2.2292843 , 0.45149755, 1.6158468 ,-4.477534  , 4.4338675 ,-3.5888722 ,-2.2577443 , 2.648855  , 4.4507165 ,-4.6514072 ,-0.74817985,-3.182327  ,-1.3589416 , 0.93689245,-2.5330613 ,-3.114144  , 0.40391582, 1.1717435 , 2.394531  ,-3.4209871 ,-1.3696115 ,-3.8207295 , 2.0000634 , 2.693987  , 3.086585  , 2.903521  ,-2.149884  , 0.44542593, 1.4966886 , 0.26367816, 3.4363923 ,-2.7479713 , 0.79156274,-1.642051  , 1.0039544 ,-3.848295  , 4.3903728 , 0.33243847,-3.2833455 , 2.353057  ,-3.1497164 ,-0.14659806, 4.085301  ,-2.2828193 ,-2.6476057 , 3.6578662 , 4.410953  , 0.24965906, 4.6081295 , 0.87406915, 1.6971318 , 4.4883027 , 2.3990798 ,-2.9472117 ,-0.1569035 , 0.75272334,-1.586017  ,-3.9895067 ,-1.9349185 , 2.3048878 , 0.46985063, 2.3396938 ,-1.79147   , 0.46238607, 3.7024033 , 0.15210733, 1.5247787 ,-2.6879177 ,-3.0745072 ,-3.1346188 , 1.622853  ,-4.6427855 ,-0.68014264,-3.4689806 ,-3.458772  , 4.605259  ,-3.3465903 , 4.0176997 , 0.62666076,-0.16934091, 4.0533366 , 2.9058437 ,-2.1753752 , 2.0530415 , 1.7414057 , 1.9485176 ,-4.6453176 , 0.5048718 ,-3.9436147 ,-3.154428  ,-1.4952016 ,-1.8423259 ,-2.7371721 ,-0.37776777, 2.222526  , 0.4938752 , 4.7000623 , 4.055448  , 1.7984202 , 3.8540864 ,-4.020227  , 3.6239922 ,-0.4930054 ,-3.1983838 ,-0.59420776, 0.6061396 , 4.241923  ,-2.3298786 , 2.1898527 ,-2.1658192 ,-1.9973361 ,-3.4951637 , 0.22367272,-1.0662649 , 4.90254   , 0.9198921 ,-2.7692912 , 0.11930244, 4.3874326 , 1.6891963 ,-3.9006917 ,-4.7475104 ,-3.1887932 ,-4.247136  ,-1.0733032 ,-3.4617836 , 2.7277462 , 2.8156383 , 1.7267603 ,-2.2093835 ,-2.3587527 ,-3.2521384 ,-4.2703094 ,-1.1650047 ,-2.5386038 ,-3.226933  ,-4.110442  , 4.928302  ,-1.9791938 , 1.951922  ,-0.05581883, 1.9885951 , 1.724876  ,-1.5001607 ,-2.4702833 ,-4.046387  , 4.598048  , 2.0017822 , 2.4606009 ,-3.0076113 ,-3.630138  ,-0.58256227,-1.9988314 , 1.7240179 , 2.5316033 , 4.7843785 ,-0.27961534, 0.45785254, 3.4268925 , 2.2096367 , 2.6645563 ,-1.963684  ,-4.9165344 , 2.946002  , 1.0085366 , 3.7448997 , 4.444165  , 0.20047304, 0.72959673,-3.989426  , 4.233162  ,-2.6953456 , 1.3067822 ,-3.7083473 , 3.799648  ,-3.0094013 ,-2.4605138 ,-4.863343  , 1.2970219 ,-1.4700948 ,-4.939425  ,-0.4917321 , 0.6425007 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/4.txt
new file mode 100644
index 000000000..ff3c5ace3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/channel/int16/4.txt
@@ -0,0 +1 @@
+ 0.77628356, 0.8742178 , 4.3841376 , 3.9455464 , 4.8370442 ,-4.5891595 , 2.8366191 , 2.6603813 , 3.2389634 ,-1.2136962 , 0.79639715, 0.25305036, 2.9633057 ,-1.2732202 ,-0.9941141 ,-4.6739984 , 1.2970364 ,-3.9004529 ,-2.495188  , 4.454323  ,-0.41007832, 2.0026298 , 2.1573522 ,-3.6922464 ,-1.0944914 , 3.94396   , 1.9214427 ,-0.44390342,-3.8700597 ,-0.43640822, 4.6144395 ,-0.5723506 ,-3.3060803 , 0.7521421 , 1.7805135 ,-1.3054347 , 4.5463753 ,-4.731837  , 3.0060315 ,-1.1449385 ,-2.6159992 , 4.3935895 , 1.2105472 ,-0.7257971 , 0.8602565 , 4.083583  ,-3.8691707 ,-3.131461  , 4.156716  ,-1.1995463 ,-3.1585817 , 0.82309705, 2.0970995 , 3.378476  ,-2.8005354 ,-0.7037894 ,-2.3273225 ,-4.640461  ,-1.9123144 ,-2.944773  ,-2.1782146 , 2.056767  , 1.7063855 ,-4.1916785 ,-1.4816947 , 1.5674059 , 1.7946502 ,-0.04801524, 3.684821  ,-1.2795266 ,-4.2824235 , 2.3105388 ,-1.7977144 , 0.598917  ,-0.13459285, 4.3482127 ,-0.39270914, 4.3593087 , 2.331621  , 1.5738527 , 1.365369  , 2.1118524 ,-3.1903272 ,-2.6296768 ,-1.7748582 , 4.4265466 ,-0.22214267, 4.2687926 ,-4.9356604 ,-4.735486  ,-1.2183895 , 4.505148  ,-0.51000804, 1.0697384 ,-4.4798536 , 3.481103  , 4.337101  , 2.3028882 ,-0.14744945,-0.09266943,-3.966115  ,-1.8844036 , 2.3948793 , 4.349734  ,-4.433698  ,-4.424103  , 0.5152364 ,-0.5438365 , 1.8132337 ,-2.065906  , 1.0651636 , 0.08070637,-1.3664498 , 0.8790827 ,-2.8489985 , 4.6403775 ,-1.7058657 , 4.744852  , 4.629411  ,-3.8179498 , 1.7875512 , 4.804663  ,-3.7588627 , 4.5770745 , 4.234062  , 4.4133935 ,-3.8471553 , 0.28418094, 3.103688  , 0.11558467,-4.9503083 ,-0.26066843,-4.038079  ,-2.5637898 , 0.13195343,-0.48607835, 0.49593657,-3.877949  , 4.033323  , 1.6440424 ,-3.3776283 , 4.429419  ,-1.5363382 , 0.96985877, 1.8553139 ,-1.0163096 ,-4.9978313 , 4.4859114 ,-1.1065072 , 4.84827   , 1.3259351 , 1.4972471 ,-2.8903277 ,-4.209779  , 4.445652  , 1.5648273 , 2.1782582 ,-2.53759   ,-0.41737884, 4.5770307 ,-0.4921347 , 3.4396465 , 1.9875554 ,-0.740242  , 3.6160038 , 1.9656229 , 0.4855264 , 0.35450923,-3.8319254 , 1.664986  ,-4.091312  , 4.332086  ,-0.59232306, 1.4464446 ,-3.7363563 , 2.524187  ,-2.8357584 ,-2.317436  , 3.5311913 ,-4.1268234 , 1.6923355 , 1.3935364 , 4.1926684 , 1.7765065 , 1.0287559 ,-4.145092  , 1.8842214 ,-0.39877102, 1.9611474 , 2.4198146 , 0.5241378 ,-1.844806  ,-1.5277894 , 0.22524206, 2.6451344 , 4.9797215 , 3.2711246 ,-2.1342309 , 3.568039  , 0.2390882 ,-0.553521  ,-4.588598  ,-1.620724  , 4.576343  ,-2.2504685 , 3.687249  ,-4.0313754 ,-4.729034  ,-1.0298516 ,-4.223112  , 1.8052602 ,-2.5888221 ,-4.5665293 , 3.9396172 ,-4.5181932 , 4.7747293 ,-3.143026  ,-3.6682124 ,-3.2871885 , 4.826698  , 2.5052974 ,-3.8402681 , 0.15495667, 1.5239644 ,-1.3660222 , 3.5605366 , 2.8165507 ,-0.57148635,-3.5490298 ,-1.4684753 , 3.8387637 ,-3.533492  , 1.1032137 ,-3.3146498 , 2.770136  ,-1.4327627 , 2.215508  , 0.08182216, 4.6495414 ,-0.83671314,-4.074062  ,-2.9050274 , 3.668077  ,-0.64422435, 4.610212  , 3.9310832 ,-2.5132718 , 3.3636646 , 4.2034388 ,-3.6327288 ,-1.4320643 , 1.3307538 ,-3.9999893 ,-1.4370643 , 0.24813278,-3.6330073 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/0.txt
new file mode 100644
index 000000000..182eb5290
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/0.txt
@@ -0,0 +1 @@
+ 3.4251418 , 1.8884782 ,-4.061519  ,-2.1329548 , 3.851976  , 3.668601  ,-0.7418167 , 2.379966  , 0.87259316,-3.96981   ,-4.627804  ,-3.3958297 , 3.025158  ,-1.299777  ,-4.322816  , 3.9173064 ,-0.55214256, 1.9224825 ,-4.8571157 ,-4.778045  , 3.3015614 , 0.56785774, 4.7985554 ,-0.4355816 , 4.9478025 , 1.7909397 ,-0.7620663 ,-0.09947702,-3.0230513 , 1.3817457 ,-4.5706887 ,-3.4097836 ,-4.7086477 ,-3.4651487 , 1.4401027 , 4.7513933 ,-1.0788624 ,-3.4946275 , 4.607974  ,-3.1215246 ,-1.4637078 ,-3.5266285 , 2.1268125 , 0.19458893, 4.058288  , 2.2452407 , 0.7575343 , 0.12213306, 4.885321  ,-1.2482406 ,-1.1034219 ,-4.054173  ,-3.6471267 , 4.774012  , 0.9450243 ,-2.5827825 ,-2.3991685 ,-2.8482654 , 0.9294943 ,-3.1165063 ,-1.6113516 , 0.04260086, 2.0987031 , 2.1601508 , 4.9740996 , 3.7719023 , 2.6817482 , 0.42131838,-1.4525859 ,-0.5124655 , 2.6313434 , 4.5606523 ,-4.6180778 , 4.788594  ,-0.8446551 ,-1.5460813 , 1.4288356 ,-1.9648911 ,-4.9766145 ,-2.405665  ,-0.30327383, 3.5204673 ,-3.848158  ,-2.6913974 ,-2.76141   , 4.336643  , 1.4205143 , 4.5898    ,-0.93183124, 4.2199287 ,-4.216924  ,-1.0979122 ,-2.3032405 ,-3.4457245 , 2.944412  , 2.137278  , 1.0326933 , 2.3116126 , 4.2138443 , 1.8283377 , 0.28901085,-1.8877143 , 0.50673705, 1.4360197 ,-2.924691  , 0.9819095 , 3.4656513 ,-2.541582  ,-1.9102442 , 3.3629627 ,-0.9675056 , 0.5937253 ,-2.4236617 ,-1.4193813 ,-0.7552614 ,-1.7121441 , 4.39647   ,-2.2712908 ,-4.3387337 , 1.5912663 , 0.8397044 , 0.17277755, 1.5272428 , 3.571715  ,-1.4471695 , 1.8623346 ,-4.3603377 , 1.2116091 , 4.960487  , 2.3681397 , 1.2925869 ,-4.3249073 , 2.4402251 ,-1.4506928 , 3.023616  ,-3.232099  ,-4.0106025 , 3.5774167 ,-0.6024932 , 1.0183483 ,-2.8215308 , 3.7395437 , 1.9100485 , 3.892712  , 4.6569633 ,-3.251774  ,-3.6923678 ,-4.8891983 ,-3.8605282 ,-4.0293036 ,-2.8199108 , 4.1668954 , 2.1569817 ,-2.9700332 ,-0.7035824 ,-0.5176811 ,-3.1826456 ,-3.334556  , 4.9103675 , 3.8513231 , 2.8609774 , 1.1845547 ,-1.4094447 ,-2.0445833 , 0.9833705 , 4.481276  , 3.83006   , 4.6240997 ,-4.268881  ,-0.85518706,-2.2650888 , 4.032545  , 0.9495817 , 1.1353155 ,-4.6551876 ,-2.2839146 , 2.6291692 ,-3.0398533 , 0.52652216,-1.8323399 ,-0.12300313, 0.46178594, 1.120684  , 1.4657134 ,-1.9794375 , 0.08941289,-4.4573083 , 2.7112565 , 4.9227715 , 2.4938288 ,-0.37153494,-4.1604757 , 4.7694197 ,-1.3021677 , 2.454714  ,-2.4902875 ,-2.760436  , 0.05183195,-2.6723208 ,-1.1471758 ,-2.2565122 , 0.20876396,-0.7288584 , 0.4386669 , 0.7846054 , 2.7294593 ,-3.836883  , 2.7501638 ,-4.775067  ,-3.2403855 ,-2.0307286 ,-1.6403166 , 4.9471517 , 1.0428456 , 2.5126355 , 3.0090203 ,-2.3476288 ,-2.9215205 , 3.8079188 , 0.83959275, 4.2670302 , 1.2338712 , 2.7329903 , 2.2549257 , 4.882931  , 0.12783106,-2.4392028 ,-2.4590807 , 4.2874207 ,-0.08333418,-3.4244132 ,-0.2235516 ,-4.23632   ,-1.3970895 , 2.1245553 ,-2.513883  ,-2.8092728 ,-1.9194845 ,-4.1932216 ,-3.7431748 ,-1.1063433 ,-3.714845  , 1.7230242 ,-0.19162221, 1.1123114 , 3.937181  , 2.6165597 ,-0.61531806, 0.44309503,-2.9260228 ,-3.1617007 , 0.0663496 , 2.4541974 ,-2.714474  , 4.2564497 , 1.2300675 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/1.txt
new file mode 100644
index 000000000..dd8037244
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/1.txt
@@ -0,0 +1 @@
+-4.8834    ,-4.6238756 , 2.020674  ,-2.3068821 , 3.7487323 ,-0.36079448, 0.08661745, 3.423143  , 3.3073757 ,-2.709357  , 4.4810205 , 3.4159606 , 4.1597505 ,-4.249789  , 2.3782206 ,-2.02848   , 0.90137833,-0.6249625 ,-3.5300052 ,-4.1113796 ,-3.768913  ,-3.59854   , 2.0896666 , 1.7677166 ,-2.3101497 ,-1.0116942 ,-3.7846713 , 2.4777756 , 3.413987  ,-2.1964507 , 0.08637846, 0.02552292,-1.9918599 , 0.7785565 ,-4.065995  , 0.8808776 ,-2.0446506 ,-1.8421272 , 0.42566776, 3.8834689 , 4.900111  ,-3.0617309 , 4.0613194 ,-3.3601153 , 3.678536  ,-4.1136184 ,-4.2903633 ,-2.6918027 , 3.4335177 ,-3.9272869 ,-1.6882807 ,-1.9629028 , 4.2125826 , 1.6536059 ,-1.1801353 , 4.8443203 , 2.9393198 , 0.4306524 , 4.390743  ,-4.6322317 , 2.932263  , 4.140538  , 2.7385068 , 2.620753  , 2.0725663 ,-1.3642436 ,-0.48539641,-4.2409816 ,-1.5950899 ,-1.688442  , 4.4769464 ,-1.25038   , 3.462903  , 0.5011836 , 0.981037  , 0.63532305,-3.4727957 , 4.6721544 ,-3.481392  , 2.8904114 ,-1.7057139 , 1.0501702 , 3.0799537 , 1.6698593 ,-1.3895478 , 4.487443  , 2.5352533 ,-0.19357985, 0.78166926, 3.5892236 ,-4.3259463 , 2.8381345 , 1.3652785 ,-0.40142608,-0.62102544,-3.088937  ,-4.0266094 , 4.7095647 , 2.0513067 ,-1.8115149 , 0.11062156,-4.5980725 , 2.809295  , 4.2042894 ,-3.4689455 ,-1.3418434 , 2.9026117 ,-1.6125411 , 2.153075  ,-3.4445221 , 3.4869678 , 1.8746428 , 0.8482056 , 3.0525062 , 1.715966  , 1.7684505 ,-2.0022326 ,-4.3427444 ,-3.1659825 , 1.6855526 , 3.1612136 , 2.0646648 ,-3.972224  ,-2.91726   ,-3.5450957 ,-2.7226381 ,-0.3273488 ,-2.5905557 , 3.6621993 ,-4.3285728 ,-0.6200474 , 0.08522832,-2.1981175 ,-3.4179437 , 2.5989106 ,-0.8503352 ,-3.3723786 , 3.9595454 ,-0.5431398 ,-2.6962373 , 1.9689399 ,-2.8925    ,-1.2064192 , 1.606632  , 2.2728612 ,-0.1403075 ,-4.8031726 , 0.1549256 ,-1.3698703 , 0.78889227,-2.286554  , 0.96417916,-0.10438658,-3.8131578 , 2.9322996 , 2.4103441 , 4.4864798 , 0.02176606,-1.1966147 ,-3.6921146 , 4.943659  ,-1.0050472 ,-1.2238564 ,-4.5758605 ,-2.6865735 , 1.7294792 , 4.180183  , 3.157911  ,-3.581904  ,-2.9112866 , 4.1674094 , 3.2326035 ,-2.7883985 ,-0.09154221, 0.8667318 ,-4.532571  , 0.816668  , 3.1307516 ,-4.1993947 ,-1.0503744 , 0.123965  , 0.17691068,-3.1465137 ,-1.4964765 , 3.4077635 ,-0.35415363, 1.9092371 ,-4.709203  , 1.148622  , 4.4766874 ,-2.193539  ,-3.7959206 , 1.4420112 ,-2.5300896 , 4.107192  , 3.4666913 ,-2.1158516 ,-3.182484  ,-2.8406513 ,-1.9396024 ,-2.3695247 , 3.8301885 ,-1.5032169 ,-0.48879272, 0.41695955,-1.1829228 , 4.822825  ,-2.9244933 ,-3.8178608 , 2.7742817 , 2.6998327 ,-3.1187122 , 2.508593  , 1.2989064 , 2.3436947 ,-0.39074868,-3.034766  ,-1.8690065 , 4.850296  ,-2.4549792 , 4.839528  , 2.2758777 , 2.6689568 , 3.2014422 , 3.6975234 ,-3.2566156 , 3.546554  , 1.9570364 ,-2.753807  , 2.3366053 ,-4.357898  , 4.9184504 ,-1.0057111 ,-3.8582199 , 1.2416974 , 4.355522  ,-2.7863925 , 0.4679685 , 2.6850772 , 2.9984746 , 2.434312  , 2.9931593 , 2.2637212 ,-0.18371914,-4.07688   ,-2.0402577 , 0.5173147 , 0.19596666, 4.71653   , 4.291663  ,-3.3575501 ,-1.0857964 ,-0.16504912, 3.6683955 , 2.9581416 ,-1.354989  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/2.txt
new file mode 100644
index 000000000..1295bfdba
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/2.txt
@@ -0,0 +1 @@
+ 1.2340723 ,-1.7371651 , 4.271641  ,-2.3332376 , 0.82301813,-3.4199295 ,-0.75806665,-2.2647665 , 2.613749  , 2.2658496 ,-2.1277714 ,-0.465433  ,-0.1323059 ,-1.9658507 ,-4.7780223 ,-4.392719  ,-0.81063855,-3.639001  ,-3.6398284 , 4.6309023 ,-0.17483327, 1.7921627 ,-1.1493484 ,-3.8145075 , 2.2367268 ,-0.40209827,-1.4159911 , 2.3032134 ,-4.154446  , 1.6760192 , 2.3430173 ,-1.386683  , 3.3363335 ,-2.976934  , 3.3983    ,-0.0069695 , 3.7025425 ,-1.8683758 , 0.72029626, 2.7558882 ,-4.4060984 , 2.553126  ,-3.5888321 , 1.8549582 ,-0.52258795, 4.6549897 , 0.8886988 ,-3.0400214 ,-3.6890693 , 3.6663766 ,-4.8026586 , 1.0636287 ,-2.9774907 , 0.39021772,-4.2414255 , 2.914968  ,-0.24334456,-4.0344954 ,-1.1011956 ,-3.8205252 , 0.05693521,-4.1379023 , 1.0584197 ,-4.0404034 , 4.841462  ,-1.2727845 , 2.6974225 ,-4.2507453 ,-2.7101111 ,-2.9800036 , 0.3082796 , 3.6763537 , 2.3277721 ,-4.9667864 ,-2.4498677 , 0.2704629 , 3.006634  ,-1.1129389 , 4.373073  ,-1.2066779 ,-3.1575904 ,-2.721046  ,-0.861226  , 1.7315729 , 2.255666  , 2.5448847 , 3.1268334 , 1.5189171 ,-3.1992466 , 0.607633  , 4.0749955 , 1.2546133 ,-1.5335796 ,-1.6200712 ,-3.9392874 , 1.053699  ,-0.87970537,-3.9218261 ,-2.2724128 , 0.82235074,-2.3400521 , 3.6467028 , 1.6891364 ,-1.6333519 , 2.2639709 ,-0.08272895,-3.076964  , 3.731091  , 3.7932968 , 2.496441  ,-4.12142   ,-2.0908666 ,-4.994248  ,-0.0429902 ,-4.6083336 ,-4.522535  , 4.717733  , 1.6715643 ,-4.779822  , 1.2919815 ,-4.6121325 ,-0.6206874 ,-2.6633883 ,-1.9632595 ,-3.2203329 ,-0.6556523 , 1.3083993 , 0.13287744, 4.599294  ,-1.1777852 ,-2.9159715 ,-0.25669238, 0.48217958,-3.9736347 ,-0.774503  ,-0.7264863 ,-3.0058725 ,-2.1682055 , 2.6579158 ,-4.4020653 , 3.0450368 , 1.3798735 ,-4.9858127 ,-4.5812607 ,-3.7349749 ,-4.4158583 , 1.631093  ,-3.0769646 ,-3.8406906 , 1.6544044 , 0.36895755,-1.8196682 ,-2.0880237 ,-3.708266  ,-2.0277069 , 1.0536597 ,-3.6726243 , 1.1704421 , 2.3201573 , 1.4994124 , 4.0197086 , 2.1001272 ,-0.39845964, 4.879206  ,-4.6042013 , 4.367211  , 2.2712052 , 2.7754369 ,-3.156667  , 4.349216  ,-4.111492  , 1.0267047 ,-2.3381946 , 4.8876834 , 4.876814  ,-0.28538027, 4.8861    ,-0.95963717, 0.46279734,-4.5789995 , 0.26168647,-0.8879058 , 2.4468584 , 1.3030591 , 3.7261188 , 3.9933589 , 2.4964094 ,-1.3851117 , 0.7147012 ,-3.8367457 , 0.79737735,-0.5907085 , 4.317288  , 0.7659837 ,-4.821792  ,-1.466433  ,-1.147227  ,-1.8638811 , 2.5115767 , 1.9449657 ,-2.4122007 ,-2.4968379 , 0.7738737 ,-1.4761454 , 4.131583  , 0.4211128 ,-2.4312468 ,-1.9722428 , 2.2810268 , 4.950381  ,-0.0406047 , 4.67312   , 0.66613483,-0.28880936, 3.2917845 , 1.6225572 , 4.809879  , 0.48241946,-3.654634  , 0.68542016, 1.3973923 , 3.479005  ,-1.4296091 , 0.64391786,-4.0887494 ,-2.186845  ,-4.5834355 ,-0.67726034, 2.4158256 ,-2.4787726 , 0.4353257 , 2.9205139 , 0.10488439, 2.0790074 ,-4.5518365 ,-3.3856661 , 3.940736  ,-1.7141095 ,-4.8946457 , 1.1085542 , 3.785141  ,-2.4175835 , 3.7720537 , 4.623048  , 2.2239215 , 0.11616404, 0.09229392,-3.637964  ,-2.334849  ,-0.95000714,-2.1338253 , 3.2281857 ,-4.0220475 , 4.7304025 ,-1.8075961 , 0.2428817 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/3.txt
new file mode 100644
index 000000000..378b5fea5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/3.txt
@@ -0,0 +1 @@
+ 2.4605505 ,-2.7001262 ,-4.3874917 ,-2.9867616 ,-3.4332    , 0.76675916, 3.4377892 ,-0.6712793 , 1.8018581 , 1.8148962 , 2.0353577 ,-4.766427  , 3.2487285 , 3.886249  ,-2.8867183 ,-0.7906634 ,-4.376028  ,-4.2085958 ,-0.36025277, 0.6360799 ,-4.687723  , 4.8313313 , 3.3582768 , 2.1117954 , 0.9821817 , 3.3697798 ,-1.1784939 ,-3.1590316 ,-0.24019621, 0.20640443, 1.2808957 , 2.3346424 , 2.13951   , 0.61864626, 2.4020443 ,-1.9671458 ,-1.6852348 , 0.32225233,-2.3928862 ,-4.173372  ,-2.282281  ,-1.271318  , 3.0839682 ,-4.4726086 ,-0.635177  , 3.2710915 , 3.08071   ,-0.7311931 , 2.1444874 , 0.4102332 ,-3.332888  ,-4.8965516 , 3.903695  , 1.4920163 ,-4.041926  ,-0.3941788 , 3.6352818 ,-2.098405  ,-0.9248165 , 2.6277795 , 3.225142  ,-1.4461963 ,-4.2050753 ,-0.2213572 , 1.9704323 , 3.298732  ,-4.710403  , 3.6876736 , 2.0771818 , 1.3559113 , 1.328373  ,-4.4079022 ,-3.28067   , 3.8852313 , 2.322237  , 2.3243637 ,-1.9126451 , 4.6277676 , 1.7031307 , 0.74861574,-4.688967  , 3.9351206 ,-1.8054084 , 1.5824287 , 3.5381088 , 2.4798677 ,-3.3099444 ,-3.8518245 , 1.5562242 ,-1.9466928 , 0.08375791,-0.16754703, 2.9265418 ,-1.6599798 , 2.766202  ,-2.8269696 ,-0.19389874, 2.0869334 ,-1.5073173 ,-3.2024453 ,-3.6522708 ,-4.588111  ,-2.3425827 , 4.8709297 ,-1.4231887 , 1.0590451 ,-1.6406479 , 0.37192422, 0.7313186 , 0.3865313 ,-4.2832613 , 3.9712496 , 0.07653506, 0.2593589 ,-2.6036396 ,-0.45185068, 3.6537335 ,-0.6341783 ,-0.6381408 ,-1.0992868 , 2.766365  , 4.666631  , 4.416099  ,-3.6654727 ,-4.0626607 ,-3.4928396 ,-0.6944366 , 4.869798  , 4.2240977 , 0.9655519 ,-2.5654511 , 1.3396966 ,-3.7639391 ,-1.2369057 ,-3.7242758 ,-0.5189227 , 1.6548159 ,-2.6197302 , 4.2732763 , 2.239486  ,-4.316255  , 3.2419755 ,-1.9283817 , 0.22489135, 2.6034477 , 0.15818155, 2.0811818 , 0.836994  , 2.7832468 ,-0.68581384, 0.89475006,-3.1455147 ,-4.818614  ,-4.1738377 , 0.4281551 ,-2.935886  ,-3.7582467 , 0.58168256, 0.2854076 , 1.0492616 , 2.2415884 ,-4.4923434 ,-3.2479804 , 3.8439462 , 3.9802108 ,-0.9027783 , 1.7783072 ,-2.2782066 , 4.4638705 , 4.28735   , 4.291463  , 1.1685107 , 1.2765578 ,-3.7954235 ,-3.494621  , 4.4340134 ,-3.5995178 ,-4.3025713 , 3.3037348 ,-3.6675146 ,-1.7871013 ,-1.2922373 , 0.72924066,-4.7065907 , 2.1388702 , 2.3570008 , 3.9203117 , 0.07483537,-2.8389792 ,-1.795164  ,-4.380931  , 1.3189598 , 2.4404252 , 4.4774084 ,-1.2798066 ,-4.95842   , 1.8095461 , 4.2692375 ,-2.0918155 , 0.33083543,-3.794544  , 1.4940621 ,-3.9446015 ,-0.38208306, 0.30863285,-0.6832849 ,-2.5675633 ,-4.948772  , 1.5904989 , 3.0415509 ,-4.899339  , 0.9415345 ,-0.91124976, 4.4849253 ,-3.4605968 , 1.6737833 , 1.9091597 , 1.3111106 , 2.0829957 ,-2.1308084 ,-2.912219  , 1.1306196 , 2.231948  , 4.7522073 ,-2.1438766 ,-2.1000512 ,-0.2984778 ,-1.2093959 , 2.6259391 , 1.8113437 ,-4.137133  , 2.716111  , 3.4318748 ,-0.89123845,-3.70718   , 2.453927  ,-0.22418758,-3.098459  ,-4.4986243 , 0.85048616, 2.8023102 , 3.743153  , 0.9931644 , 3.8588202 , 1.7585737 ,-4.2855363 ,-2.5475764 ,-0.83141845,-1.9358089 , 3.1711586 , 2.4221613 ,-1.881327  ,-3.7230873 ,-4.55259   ,-0.42294836, 4.64625   
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/4.txt
new file mode 100644
index 000000000..339435425
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mean_000/layer/uint8/4.txt
@@ -0,0 +1 @@
+-3.37344313e+00, 2.78325319e+00,-7.30300546e-01, 1.33456266e+00, 3.96648932e+00, 4.33421373e+00,-3.11558557e+00,-3.64659280e-02,-1.73589993e+00, 4.81018400e+00,-8.32905114e-01, 2.33330703e+00, 1.85830116e+00,-4.60395622e+00, 5.26070774e-01,-4.71355534e+00,-2.97202754e+00, 3.57638383e+00, 4.50985909e+00, 2.08423686e+00,-1.85349309e+00,-2.18306184e+00,-4.65403509e+00, 4.31280661e+00, 1.16069472e+00,-4.85344124e+00, 8.40563923e-02,-1.98723459e+00,-4.29561710e+00,-2.57372570e+00,-4.22641230e+00,-4.00811911e+00,-9.61861551e-01,-2.14665198e+00, 4.18120289e+00,-3.87826174e-01,-2.86187083e-01,-4.84979200e+00,-1.34733701e+00, 1.27489030e+00, 1.98844969e+00,-4.11230135e+00,-1.61191213e+00, 2.63515592e+00, 4.35539484e+00,-1.56582773e+00,-2.45283508e+00, 1.44556177e+00,-8.56053472e-01, 3.25111747e+00, 3.58699083e+00,-2.47732449e+00, 3.64130282e+00,-4.91288567e+00, 8.97059917e-01,-2.26010180e+00, 4.91831064e+00, 4.45047706e-01, 1.88655663e+00, 3.20642543e+00, 1.38243341e+00, 9.06112790e-01, 1.15262544e+00,-2.39862514e+00,-2.87477684e+00, 7.36831248e-01, 3.18799114e+00, 1.22698748e+00, 5.63625395e-01, 1.29130912e+00,-4.89572334e+00, 2.11258578e+00,-4.55420208e+00, 4.94569272e-01,-7.08617330e-01,-1.84863120e-01,-4.81965256e+00,-1.06512284e+00, 4.79633398e-02, 2.70429182e+00, 4.78289175e+00,-2.11806059e+00, 4.23046875e+00, 3.18022132e+00,-8.39496255e-01, 3.13150501e+00,-3.24103773e-01,-7.48505890e-01,-2.45754886e+00, 4.16639376e+00, 3.25864077e+00, 3.40006447e+00,-3.77217412e+00, 2.93266010e+00, 3.33685803e+00, 1.02347994e+00,-2.22839618e+00,-1.90375733e+00, 3.24283957e+00,-4.01684284e-01,-4.45417643e+00, 3.74440104e-01, 3.33520865e+00, 6.64106190e-01, 3.84395885e+00, 2.38586918e-01,-1.51634857e-01,-2.64977455e+00,-3.45786500e+00, 4.89002228e+00,-1.07323432e+00,-2.92749858e+00,-1.76510501e+00,-3.44604325e+00,-1.89681911e+00, 4.20239258e+00,-1.75864971e+00, 2.13181686e+00, 3.90355319e-01,-4.11911535e+00, 6.61891177e-02,-4.32988214e+00,-1.42876351e+00, 3.12163901e+00,-4.56227779e+00, 4.17938662e+00, 9.63881195e-01, 4.35952139e+00, 1.61931109e+00, 4.11196423e+00, 2.25612569e+00,-4.77538586e+00,-1.72600198e+00,-4.39411783e+00,-8.98730099e-01,-1.04562032e+00,-2.81517529e+00, 3.57167959e+00, 1.90318239e+00, 2.17302442e+00,-3.79942179e+00, 2.19838643e+00,-4.16209459e+00, 4.45025682e+00, 1.68786839e-01,-2.56879544e+00, 3.60925221e+00, 1.06542781e-01,-3.48755455e+00,-6.77028894e-01,-3.51582170e+00, 3.90697241e+00, 4.49116230e+00,-1.56180394e+00, 4.96249914e+00, 9.63374436e-01, 2.72304177e+00, 8.38046610e-01,-2.91993833e+00,-9.41783428e-01, 8.00800502e-01, 3.89176035e+00, 6.70560122e-01, 2.76782703e+00,-1.37075472e+00,-3.25303817e+00,-4.41226482e+00,-8.38777184e-01, 1.73568249e+00,-1.09438455e+00,-1.08815920e+00, 1.06787062e+00, 2.04415274e+00,-2.93027782e+00,-6.86941504e-01, 3.83109421e-01,-3.49270535e+00,-2.13225913e+00,-3.61786675e+00, 1.32213378e+00,-2.89654016e+00, 4.23944092e+00, 4.53665400e+00, 4.26081800e+00,-1.95718706e+00, 4.72295076e-01,-3.08592963e+00, 2.53354859e+00, 3.80069661e+00,-1.14408419e-01, 2.39438844e+00,-4.73618507e+00, 2.35079074e+00,-1.43686843e+00, 1.32946157e+00, 1.10381134e-01,-3.49878430e+00, 2.83181930e+00, 4.57872486e+00, 2.29953095e-01, 7.19881415e-01,-2.97208834e+00, 4.11286211e+00,-3.89149117e+00, 3.83631349e+00, 4.14627981e+00,-1.14082299e-01,-6.89825296e-01,-2.55468488e+00,-4.04466152e+00, 9.95541453e-01,-2.59181118e+00,-4.60567427e+00,-4.77339029e+00,-7.36041367e-02, 1.85957468e+00,-3.42530179e+00, 4.55782986e+00,-3.29603004e+00, 3.55632234e+00, 2.40858841e+00,-2.07399082e+00,-3.96705031e+00, 4.41718817e+00, 3.19581985e+00,-3.72379017e+00,-3.76826024e+00, 6.79764748e-01,-4.43838930e+00, 2.29627752e+00, 2.34923697e+00,-4.23308420e+00, 3.80186272e+00, 8.65862250e-01, 8.44927967e-01,-1.05974531e+00, 4.70531940e+00, 1.25060010e+00, 4.82314730e+00,-4.53093815e+00, 4.51410580e+00, 4.95166332e-01,-3.45584202e+00, 1.82002666e-03,-3.27616286e+00,-2.68104935e+00, 2.39554620e+00, 2.99364328e+00,-2.57998848e+00,-4.35891914e+00, 4.64737415e+00,-5.74958742e-01, 6.47293210e-01, 1.85961032e+00, 4.49567413e+00,-4.36166048e+00
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/0.txt
new file mode 100644
index 000000000..2f14c1e05
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/0.txt
@@ -0,0 +1 @@
+-1.6401564 , 1.5341219 ,-1.6809373 ,-0.23482142, 1.0358882 , 4.1831236 ,-1.1044886 ,-3.6126914 , 3.4088228 , 0.7250639 , 4.935399  ,-1.0157012 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/1.txt
new file mode 100644
index 000000000..6d7e58528
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/1.txt
@@ -0,0 +1 @@
+-0.8402546,-1.1927719,-1.0746883,-4.480075 , 2.5831218, 3.5992115,-4.108747 ,-1.5075738,-2.004373 , 4.8728533,-4.5553994,-2.8098056
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/2.txt
new file mode 100644
index 000000000..b4e7cda03
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/2.txt
@@ -0,0 +1 @@
+-3.056717 , 4.2170177, 1.4303325,-4.606602 , 2.3446174, 3.184377 , 2.3996902,-2.0212016,-4.653397 , 4.8631716, 1.8641028,-2.1063576
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/3.txt
new file mode 100644
index 000000000..0bf886f56
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/3.txt
@@ -0,0 +1 @@
+ 1.9526653 ,-4.830058  , 1.2988665 , 2.9302614 ,-1.628051  , 3.9963005 , 2.2121394 ,-0.54135066, 2.6000595 , 4.6699815 , 1.7348015 ,-2.545231  
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/4.txt
new file mode 100644
index 000000000..944e41e36
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/channel/int16/4.txt
@@ -0,0 +1 @@
+-3.865225  , 2.0706809 , 2.0491793 , 2.2364922 ,-2.5974188 ,-1.8711575 ,-2.917702  ,-0.9958814 ,-0.18635549,-2.7875533 , 1.4581085 ,-2.5568976 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/0.txt
new file mode 100644
index 000000000..e580d6f85
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/0.txt
@@ -0,0 +1 @@
+-4.024665 , 3.0544488,-4.5645285,-3.2134292,-2.1543078, 4.039755 ,-4.613908 , 4.2014904, 3.8222141,-4.4992657,-4.02681  ,-3.2933445
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/1.txt
new file mode 100644
index 000000000..c593dfbb6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/1.txt
@@ -0,0 +1 @@
+-2.669042  , 2.479217  , 4.691815  , 1.8187722 ,-3.7656548 ,-2.0555806 ,-2.4494352 ,-3.2394514 ,-0.38215363,-1.543695  ,-0.6927158 , 2.3534324 
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/2.txt
new file mode 100644
index 000000000..14520a177
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/2.txt
@@ -0,0 +1 @@
+ 4.036224  ,-1.2903051 , 1.2116423 , 3.92255   ,-0.48049024,-1.0290806 ,-0.9644837 , 1.3379688 ,-1.0027533 ,-1.9611529 , 3.7190473 , 0.45794436
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/3.txt
new file mode 100644
index 000000000..2238d5e9e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/3.txt
@@ -0,0 +1 @@
+ 4.560488 ,-1.2475324, 1.8892838,-2.0155866,-4.968927 , 0.3717404,-0.6095849, 3.2483344,-1.2499679, 1.4237018,-3.1225715, 3.0611598
diff --git a/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/4.txt
new file mode 100644
index 000000000..14a91ccc9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Mul_001/layer/uint8/4.txt
@@ -0,0 +1 @@
+-1.7167594, 2.116633 ,-1.3816848,-1.7106141,-3.273076 ,-4.148302 ,-2.1654181, 0.4368236, 3.4279666, 1.2954224, 1.3004405,-4.3022   
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt
new file mode 100644
index 000000000..bffcf1477
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/0.txt
@@ -0,0 +1 @@
+-2.6855478 ,-1.0063782 ,-0.05736925,-3.5160403 ,-1.1558081 , 3.615521  ,-2.8372238 ,-0.6269437 , 0.49917883,-4.913136  , 1.5265877 , 4.974778  , 3.3098187 ,-0.09241077, 3.5081398 , 0.0505144 ,-0.6729551 ,-4.4237547 ,-4.966356  , 4.141383  ,-1.2096795 ,-1.4394493 , 1.5831724 ,-2.8024888 , 2.545578  , 2.6052134 ,-0.22428347, 4.1437554 , 2.4520326 ,-3.2729409 ,-1.5977669 , 3.436161  , 1.9117191 , 2.0326712 ,-1.8667631 , 1.0792333 , 3.1582525 , 0.6821356 , 4.374157  ,-0.5893812 , 1.6538872 , 1.6130942 ,-3.498988  , 3.2384932 , 0.07478751, 3.957244  , 2.799743  ,-3.4798396 
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt
new file mode 100644
index 000000000..48a56eb27
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/1.txt
@@ -0,0 +1 @@
+ 1.2590754, 4.442343 ,-4.0991793, 2.0638733,-3.4404948,-2.646043 ,-1.7430832, 4.3756185, 2.2659435, 4.9466286,-1.80044  ,-2.531546 ,-3.7237234, 4.3929725, 4.8750644, 2.4617221, 4.887943 , 1.5641859,-2.610336 ,-4.410633 ,-2.5240743,-4.5204134, 4.1221995,-2.2482948, 4.2502975, 3.3698628, 1.0758704, 4.9336023, 1.1999178,-2.875248 , 3.1566763,-0.626414 ,-4.6683826, 4.4540606,-4.2442794,-1.7926724,-2.242487 ,-0.5315291,-0.0130378,-3.6322727, 2.6875575,-2.4005275, 2.9970996, 1.0781384,-3.0705473, 4.303309 , 4.6578   ,-1.7002223
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt
new file mode 100644
index 000000000..f93010fe9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/2.txt
@@ -0,0 +1 @@
+-4.368311  , 0.13439946, 0.24416327, 1.5211624 , 1.7509546 ,-4.778414  ,-1.5761154 ,-0.8055587 , 4.025158  , 2.5964646 , 1.4061503 , 4.0578322 , 1.1699337 , 4.7441864 ,-1.3572135 ,-4.225252  , 2.578213  ,-1.1911626 ,-3.538451  , 0.177016  ,-4.8895907 ,-0.39637035,-1.802691  , 1.7761022 , 1.2734097 , 0.31743827, 2.686107  , 0.7306774 ,-4.460011  , 4.1779523 , 3.7888825 ,-1.4841243 ,-2.8362072 , 1.2245483 , 1.7972459 ,-3.4700866 , 4.320854  ,-4.5425787 ,-3.7104082 , 3.0125594 ,-2.0490885 , 2.795659  ,-1.84546   , 2.135244  ,-0.15280259, 1.4062694 ,-3.8837774 ,-0.75812423
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt
new file mode 100644
index 000000000..db2b6edd8
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/3.txt
@@ -0,0 +1 @@
+-2.8243983 ,-4.4524055 ,-1.0317615 ,-1.1988745 ,-3.2368467 , 4.609625  , 4.2858357 ,-2.6587856 , 0.38659486,-3.3237243 , 3.0585716 ,-3.260465  ,-4.9664984 ,-2.1594415 ,-3.934371  , 2.1108305 ,-4.30752   , 1.8228706 ,-1.2307801 ,-2.1066875 , 2.4061344 ,-3.1168818 , 1.6208204 ,-4.468139  ,-0.02087302,-2.7939155 , 2.927627  ,-0.41147447, 0.14678593,-3.6301854 ,-3.0083933 , 1.9953763 , 2.6338673 , 1.8736236 , 2.3656332 ,-3.8076937 ,-0.29208612,-4.7813087 ,-3.6228116 , 4.764327  ,-3.261239  , 3.5934968 , 0.93079615,-0.8640369 , 3.0465791 ,-2.7058053 ,-3.1439428 , 1.197273  
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt
new file mode 100644
index 000000000..fc7ec3c12
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/channel/int16/4.txt
@@ -0,0 +1 @@
+-3.8762918 ,-0.23988086, 4.05123   , 4.471294  ,-1.2950295 , 4.0446515 , 1.4410477 , 4.087779  ,-4.366653  ,-1.1168864 , 0.6057049 , 2.1632779 ,-3.588556  ,-3.8958907 , 4.7070394 ,-2.9224374 ,-3.914126  , 1.7182319 ,-1.897104  ,-3.5838506 , 4.097744  ,-4.6861534 , 4.3335524 , 3.4057853 , 1.0159183 ,-2.026307  ,-3.1870387 , 2.874219  ,-4.3282895 , 3.3507135 ,-2.3833654 ,-3.4183152 , 0.15211865,-0.482133  , 1.6552299 , 2.4949796 ,-4.837721  , 3.7088242 , 1.3266064 ,-1.3987536 , 3.7350404 , 0.80048573,-4.469703  ,-1.0222132 ,-1.3030492 ,-2.8426054 , 1.4573859 ,-4.8017726 
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/0.txt
new file mode 100644
index 000000000..1f2993269
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/0.txt
@@ -0,0 +1 @@
+-3.3436873 ,-0.79453826, 2.2211137 , 2.6420908 ,-1.3191302 , 1.2973647 ,-4.506594  , 4.867371  ,-4.318404  , 1.6957753 ,-4.3091793 ,-3.2230556 , 4.9175825 ,-3.1527104 ,-2.6669753 ,-2.1135337 ,-3.7701926 ,-3.358504  ,-4.419803  , 3.2045574 ,-0.5828494 ,-3.5796826 ,-4.0088696 ,-4.7178082 , 2.2726505 , 2.1860175 , 3.7198956 ,-0.5788681 ,-3.7766652 ,-0.65016747, 3.707159  ,-2.240267  , 4.5772953 ,-0.54754776, 4.7143884 ,-3.196982  ,-3.6356654 , 3.7157805 , 3.1312432 , 0.58816016, 2.1710336 ,-1.600533  ,-3.689763  , 4.322089  , 0.4816874 , 2.2769346 ,-3.9072733 ,-0.58615017
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/1.txt
new file mode 100644
index 000000000..a19ea6696
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/1.txt
@@ -0,0 +1 @@
+-1.275483  ,-3.6622071 ,-0.87433696, 0.60946655, 1.4415421 , 3.3705983 , 2.2635043 , 3.3926573 ,-0.2936643 ,-0.5169573 , 3.2535644 , 2.1269164 ,-3.4180303 , 1.0427854 ,-1.3514856 , 3.6084783 , 4.569944  ,-0.79272085, 2.9771423 ,-1.6668562 , 4.8700657 , 0.3355385 , 0.76509756, 3.5142152 ,-1.6743544 , 4.794434  ,-2.958765  ,-0.23857778, 2.4555902 , 2.459867  , 3.3922994 ,-4.350212  , 0.6286153 , 0.8139546 , 4.1676807 ,-3.3461437 , 0.69633776,-4.6548877 , 0.98267466,-4.508397  ,-1.4581255 ,-1.2289628 , 3.8701873 , 3.334336  ,-3.5611253 , 2.6133575 ,-1.0554558 ,-3.3291767 
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/2.txt
new file mode 100644
index 000000000..7113eb52e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/2.txt
@@ -0,0 +1 @@
+-0.6250365 ,-4.798417  ,-4.214081  ,-3.625409  , 2.4391694 , 4.1856265 , 3.2472587 ,-3.20996   ,-2.3537548 , 1.3749354 , 2.5947835 ,-1.8891864 ,-3.612735  , 2.246563  , 1.2701501 ,-2.8927476 ,-0.71078295,-3.6037376 ,-4.5916877 , 2.0044398 , 3.4437728 ,-1.0695096 , 4.3483944 ,-3.3387017 ,-0.9384242 , 1.4229002 ,-0.6568144 , 1.1164346 , 1.7145283 ,-2.596518  , 4.6728883 , 3.4737296 , 1.7935314 , 3.1263895 , 1.3614839 ,-3.824968  ,-3.0405738 , 3.1729462 ,-4.1985774 ,-2.9489865 ,-4.2080064 , 2.0368521 ,-2.858539  ,-0.03206728,-1.1123812 , 0.2994737 , 1.6906137 ,-0.8665008 
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/3.txt
new file mode 100644
index 000000000..afeb2c0e6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/3.txt
@@ -0,0 +1 @@
+-4.5279946 ,-3.4497826 ,-2.058617  ,-0.39549035,-0.26672208, 3.0173857 , 3.2430282 , 1.9996022 , 1.3895315 , 1.7620904 ,-4.9040093 ,-3.2858686 ,-2.2823575 ,-1.4176623 ,-0.537347  , 0.68219584,-3.193989  ,-3.1675165 , 0.47214374,-4.390378  ,-1.8730192 , 1.4416525 ,-3.0460286 ,-0.73547626, 1.8686327 ,-0.8146671 ,-2.0906649 , 0.01226121,-0.06992937, 0.9302521 ,-2.1858516 , 4.8370657 ,-4.1847024 , 4.4963436 ,-1.3834711 ,-1.1244944 , 0.4290957 ,-4.2681174 , 1.2978764 , 3.4149706 ,-2.7011304 ,-3.1285405 ,-3.8857136 ,-0.18625297,-0.13618916, 2.427405  ,-1.7979074 ,-1.4174187 
diff --git a/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/4.txt
new file mode 100644
index 000000000..99c6284d6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/PRelu_001/layer/uint8/4.txt
@@ -0,0 +1 @@
+-0.40635094,-2.485209  ,-2.9641154 , 4.09174   ,-1.9137962 ,-2.0860991 , 1.6594787 , 0.53744185, 1.7737653 ,-1.7054961 , 2.5611186 ,-1.1456238 , 2.741241  ,-2.283051  ,-4.2111306 ,-0.8722772 , 1.6465468 ,-0.61518955, 0.08495517, 3.6847656 , 3.7826371 , 2.0023444 ,-3.5326133 , 2.3723035 , 3.7383325 ,-3.3514297 , 2.031452  ,-0.7364658 ,-4.3347225 ,-2.8146286 ,-1.37377   ,-3.518721  ,-0.19657679,-1.6831368 , 1.2457223 , 0.25099897,-4.4722757 ,-4.135197  ,-0.6378818 , 3.8833187 , 1.9291897 , 2.5969315 , 2.146067  ,-2.846719  ,-2.2562532 ,-2.6856182 , 2.824374  , 2.3662992 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/0.txt
new file mode 100644
index 000000000..43cf02238
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/0.txt
@@ -0,0 +1 @@
+-3.7802553 ,-1.9713228 , 2.5057538 , 2.0533051 ,-0.4372419 , 4.400527  ,-2.53668   ,-3.636254  ,-3.6546571 , 2.1307952 , 0.81209564,-2.4923725 ,-0.32200927, 4.5844235 , 1.4608113 , 1.5916203 , 1.1521534 ,-1.6989231 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/1.txt
new file mode 100644
index 000000000..0eacf50b6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/1.txt
@@ -0,0 +1 @@
+-1.4361136 ,-4.975774  , 4.1094494 , 1.032158  ,-0.04814574,-2.422229  , 4.4187384 , 2.1794112 , 2.4763412 ,-1.2492582 ,-4.5920935 ,-4.0279813 ,-4.1276155 , 2.1920047 , 1.3363422 , 1.711838  , 1.1330673 ,-4.929005  
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/2.txt
new file mode 100644
index 000000000..1cf684299
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/2.txt
@@ -0,0 +1 @@
+-1.0732957,-1.0392259, 2.789581 , 0.8985455, 3.40924  ,-4.615564 ,-2.5503623, 1.6059098,-4.5399165, 2.4623697, 4.5237975,-0.9411976, 4.4180136,-4.4564066, 3.2571127, 2.2749462,-4.565914 ,-3.9453325
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/3.txt
new file mode 100644
index 000000000..c3cfb5569
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/3.txt
@@ -0,0 +1 @@
+-4.9445686 ,-0.12239262,-2.5453374 ,-1.2110028 ,-0.79905856, 3.981249  , 2.694105  ,-3.174401  ,-0.9479676 ,-3.8547504 ,-3.9985576 ,-0.4692157 , 1.1693578 ,-3.0193985 , 3.1290145 , 3.1759324 , 2.9510665 ,-3.6862066 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/4.txt
new file mode 100644
index 000000000..82609cfc9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/channel/int16/4.txt
@@ -0,0 +1 @@
+ 3.335564  ,-1.8178905 ,-4.7109857 , 3.3468652 , 3.6460853 ,-0.03393465, 3.7895339 ,-4.4296284 , 1.8252304 ,-2.2677863 ,-2.6210299 , 0.12556647,-1.0245817 , 1.2620107 , 4.8219824 ,-3.124949  , 0.3166363 ,-3.4773922 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/0.txt
new file mode 100644
index 000000000..eb058a1c3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/0.txt
@@ -0,0 +1 @@
+-0.55411166,-4.1992335 , 1.4317423 ,-3.7261302 , 1.151971  ,-2.117022  ,-0.7386241 , 4.654951  , 1.4869142 ,-4.6252975 ,-3.305923  , 3.632628  ,-2.6403873 ,-4.862389  , 3.477561  ,-4.9842925 ,-3.6267536 , 4.9950438 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/1.txt
new file mode 100644
index 000000000..ff15f032d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 0.18094282,-0.58095986, 1.2765085 ,-0.534363  , 4.5564513 ,-0.28305855, 0.80606604,-3.3217795 ,-0.08041744,-3.7558215 ,-0.5370528 , 1.8984528 ,-0.09462419,-0.28595117, 4.6817894 ,-4.6653147 ,-4.127137  ,-2.3407753 
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/2.txt
new file mode 100644
index 000000000..e564168bf
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/2.txt
@@ -0,0 +1 @@
+-0.62747055, 1.4133646 ,-0.9954612 ,-4.687624  ,-2.5390003 ,-4.534569  ,-1.1943612 ,-4.830596  , 4.3214984 ,-2.4795794 , 4.166298  ,-1.4772589 ,-4.074577  , 3.2332711 ,-1.5221404 ,-1.7308865 , 0.06814837, 2.944668  
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/3.txt
new file mode 100644
index 000000000..c763b6311
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/3.txt
@@ -0,0 +1 @@
+-3.2136867 , 0.6229863 , 0.02772082,-0.00820862,-2.4893622 ,-0.6757174 ,-2.2024722 ,-2.0893583 , 0.33953062,-3.5438979 , 0.7000838 , 1.3219849 ,-0.02302017, 2.3125873 ,-1.5376673 ,-4.0330076 , 4.755884  , 2.729685  
diff --git a/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/4.txt
new file mode 100644
index 000000000..12e13272d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/ReLU_000/layer/uint8/4.txt
@@ -0,0 +1 @@
+ 0.82922786, 4.762074  ,-3.5043278 , 2.4521468 , 2.6450796 ,-2.8606322 , 0.8321993 ,-1.4020495 ,-0.25749585, 1.0287803 ,-3.911455  ,-1.8311876 , 2.763438  , 3.8604703 ,-3.5478592 ,-4.2335987 ,-3.6402035 ,-1.8485361 
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/0.txt
new file mode 100644
index 000000000..a0462c946
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/0.txt
@@ -0,0 +1 @@
+ 2.4324646 ,-0.72821015, 2.7450123 , 1.1036391 ,-0.43891   ,-4.863224  ,-3.2931955 ,-2.0340252 , 4.284326  , 0.27745003, 3.761442  ,-4.3499503 ,-4.7881107 , 4.5609903 , 4.6533604 ,-4.5404058 , 2.5523062 , 0.04669883,-0.86533225,-2.082092  , 0.83845824, 2.774215  ,-2.0169477 ,-0.49454054, 3.1055443 ,-4.750468  ,-1.055414  ,-4.213197  , 3.6063917 ,-3.1573813 ,-0.776909  ,-3.001087  
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/1.txt
new file mode 100644
index 000000000..af7247264
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/1.txt
@@ -0,0 +1 @@
+-3.572596  , 3.6327808 , 4.4630327 ,-3.3366838 , 4.207919  , 2.4960644 ,-2.7017984 ,-3.7505877 ,-1.618726  , 0.26884264,-4.879002  ,-4.262543  , 3.6506212 ,-4.016184  ,-4.777153  ,-2.1727114 ,-3.4873834 ,-4.6051025 ,-4.858286  , 2.0302868 ,-2.410233  , 0.10019613, 0.63945997, 3.9416385 ,-3.2295997 , 0.905913  , 0.41592363, 2.6739645 , 2.8928993 ,-3.4646466 , 3.2302775 , 4.776366  
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/2.txt
new file mode 100644
index 000000000..8eeedd0d8
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/2.txt
@@ -0,0 +1 @@
+-2.461527  , 4.5492992 ,-1.7411181 ,-4.339588  ,-4.57271   ,-4.8995566 , 1.8722419 ,-1.8143005 ,-0.349418  ,-1.1475405 ,-1.5390202 ,-2.6817439 ,-2.1467986 ,-3.6956887 ,-0.28721192,-2.7982469 , 2.455128  ,-4.1546254 ,-0.569284  , 2.2394757 , 1.713712  ,-0.05896076, 4.192996  , 3.8417945 ,-1.4612933 , 2.6798608 , 1.8344553 ,-2.7644687 , 3.4822197 , 1.2855778 ,-0.9130815 , 2.66463   
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/3.txt
new file mode 100644
index 000000000..b3eb330e8
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/3.txt
@@ -0,0 +1 @@
+-1.4890275 ,-1.4528996 ,-2.8753834 ,-2.1400492 ,-0.05646766,-2.416613  ,-1.0417156 ,-3.1061885 , 4.8631783 ,-2.29921   ,-2.4092016 , 4.5070224 , 4.3909883 ,-1.765903  , 2.0950131 , 0.7523961 , 2.3408594 ,-0.34509352,-4.7058167 ,-4.941943  ,-2.2534447 ,-0.46839336,-1.2519743 , 1.6062143 , 4.3456235 ,-4.428032  ,-2.0671709 , 0.8194458 ,-2.7333214 , 4.9490767 ,-4.497053  , 2.786572  
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/4.txt
new file mode 100644
index 000000000..8070e7337
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/channel/int16/4.txt
@@ -0,0 +1 @@
+-1.8261101e+00, 4.3587184e+00,-2.2425966e+00, 3.2468474e+00,-4.8007107e+00, 1.8991641e+00, 2.9119995e+00,-4.8224683e+00, 1.3924009e+00, 2.0646741e+00,-3.7295690e-03, 2.2643164e-01,-1.5079597e+00, 3.5466003e+00,-4.3877802e+00, 4.6155982e+00, 1.4900422e+00,-4.9514108e+00, 3.7944238e+00,-3.1528413e+00,-4.6058831e+00, 2.6000957e+00,-1.2902383e+00, 3.6344111e+00, 3.8714981e+00,-3.8451505e-01,-4.9935651e-01,-2.1561024e+00,-1.9556242e-01, 4.7684064e+00, 8.2646644e-01, 3.4120755e+00
diff --git a/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py b/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
index b7709812c..bdf86fe29 100755
--- a/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
+++ b/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
@@ -39,8 +39,9 @@ for i in range(num_data):
 
     for j in range(len(input_details)):
         input_detail = input_details[j]
-        input_data = np.array(
-            np.random.random_sample(input_detail["shape"]), input_detail["dtype"])
+        # Generate random input [-5, 5)
+        input_data = np.array(10 * np.random.random_sample(input_detail["shape"]) - 5,
+                              input_detail["dtype"])
         sample.create_dataset(str(j), data=input_data)
 
 h5_file.close()
diff --git a/compiler/record-minmax/src/HDF5Importer.h b/compiler/record-minmax/src/HDF5Importer.h
index cf6526685..9e98c7752 100644
--- a/compiler/record-minmax/src/HDF5Importer.h
+++ b/compiler/record-minmax/src/HDF5Importer.h
@@ -21,6 +21,8 @@
 
 #include <H5Cpp.h>
 
+#include <stdexcept>
+
 using Shape = luci_interpreter::Shape;
 using DataType = luci_interpreter::DataType;
 
@@ -39,9 +41,12 @@ namespace record_minmax
 class HDF5Importer
 {
 public:
-  explicit HDF5Importer(const std::string &path) : _file{path, H5F_ACC_RDONLY}
+  explicit HDF5Importer(const std::string &path)
   {
-    // Do nothing
+    if (_file.isHdf5(path) == false)
+      throw std::runtime_error("Given data file is not HDF5");
+
+    _file = H5::H5File(path, H5F_ACC_RDONLY);
   }
 
 public:
diff --git a/compiler/souschef/include/souschef/Dims.h b/compiler/souschef/include/souschef/Dims.h
index 52c64dd47..fabbf3f95 100644
--- a/compiler/souschef/include/souschef/Dims.h
+++ b/compiler/souschef/include/souschef/Dims.h
@@ -17,6 +17,7 @@
 #ifndef __SOUSCHEF_DIMS_H__
 #define __SOUSCHEF_DIMS_H__
 
+#include <cstdint>
 #include <functional>
 #include <numeric>
 #include <vector>
diff --git a/compiler/tf2tfliteV2/tf2tfliteV2.py b/compiler/tf2tfliteV2/tf2tfliteV2.py
index c51dabde0..3fb988102 100755
--- a/compiler/tf2tfliteV2/tf2tfliteV2.py
+++ b/compiler/tf2tfliteV2/tf2tfliteV2.py
@@ -88,8 +88,7 @@ def _get_parser():
         "-I",
         "--input_arrays",
         type=str,
-        help="Names of the input arrays, comma-separated.",
-        required=True)
+        help="Names of the input arrays, comma-separated.")
     parser.add_argument(
         "-s",
         "--input_shapes",
@@ -101,8 +100,7 @@ def _get_parser():
         "-O",
         "--output_arrays",
         type=str,
-        help="Names of the output arrays, comma-separated.",
-        required=True)
+        help="Names of the output arrays, comma-separated.")
 
     # Set default value
     parser.set_defaults(model_format="graph_def")
@@ -146,6 +144,10 @@ def _parse_array(arrays, type_fn=str):
 
 def _v1_convert(flags):
     if flags.model_format == "graph_def":
+        if not flags.input_arrays:
+            raise ValueError("--input_arrays must be provided")
+        if not flags.output_arrays:
+            raise ValueError("--output_arrays must be provided")
         input_shapes = None
         if flags.input_shapes:
             input_arrays = _parse_array(flags.input_arrays)
@@ -174,6 +176,10 @@ def _v1_convert(flags):
 
 def _v2_convert(flags):
     if flags.model_format == "graph_def":
+        if not flags.input_arrays:
+            raise ValueError("--input_arrays must be provided")
+        if not flags.output_arrays:
+            raise ValueError("--output_arrays must be provided")
         file_content = open(flags.input_path, 'rb').read()
         try:
             graph_def = tf.compat.v1.GraphDef()
diff --git a/compiler/tflchef/core/src/Convert.cpp b/compiler/tflchef/core/src/Convert.cpp
index dc8e31db0..9602faa96 100644
--- a/compiler/tflchef/core/src/Convert.cpp
+++ b/compiler/tflchef/core/src/Convert.cpp
@@ -45,6 +45,10 @@ tflite::ActivationFunctionType as_tflite_activation(const tflchef::Activation &v
       return tflite::ActivationFunctionType_RELU_N1_TO_1;
     case tflchef::RELU6:
       return tflite::ActivationFunctionType_RELU6;
+    case tflchef::TANH:
+      return tflite::ActivationFunctionType_TANH;
+    case tflchef::SIGN_BIT:
+      return tflite::ActivationFunctionType_SIGN_BIT;
     default:
       break;
   }
@@ -87,3 +91,72 @@ tflite::MirrorPadMode as_tflite_mirrorpadmode(const tflchef::MirrorPadMode &valu
 
   throw std::runtime_error{"Unknown mirrorpad mode"};
 }
+
+tflite::DimensionType as_tflite_dimensiontype(const tflchef::DimensionType &value)
+{
+  switch (value)
+  {
+    case tflchef::DimensionType::DENSE:
+      return tflite::DimensionType_DENSE;
+    case tflchef::DimensionType::SPARSE_CSR:
+      return tflite::DimensionType_SPARSE_CSR;
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unknown dimension type");
+}
+
+tflite::SparseIndexVector as_tflite_sparse_idx_vec_type(const tflchef::SparseIndexVecType &value)
+{
+  switch (value)
+  {
+    case tflchef::SparseIndexVecType::SparseIdxVecType_NONE:
+      return tflite::SparseIndexVector_NONE;
+    case tflchef::SparseIndexVecType::INT32VEC:
+      return tflite::SparseIndexVector_Int32Vector;
+    case tflchef::SparseIndexVecType::UINT16VEC:
+      return tflite::SparseIndexVector_Uint16Vector;
+    case tflchef::SparseIndexVecType::UINT8VEC:
+      return tflite::SparseIndexVector_Uint8Vector;
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unknown SparseIndexVector type");
+}
+
+flatbuffers::Offset<void>
+as_tflite_sparse_index_vec(flatbuffers::FlatBufferBuilder &fb,
+                           const ::tflchef::TensorSparsity_IndexVec &value)
+{
+  auto sparse_idx_type = value.type();
+
+  switch (sparse_idx_type)
+  {
+    case tflchef::SparseIndexVecType::SparseIdxVecType_NONE:
+      return flatbuffers::Offset<void>();
+    case tflchef::SparseIndexVecType::INT32VEC:
+    {
+      auto values_vec_int32 = std::vector<int32_t>{value.dim().begin(), value.dim().end()};
+      auto values_int32 = fb.CreateVector(values_vec_int32);
+      return tflite::CreateInt32Vector(fb, values_int32).Union();
+    }
+    case tflchef::SparseIndexVecType::UINT16VEC:
+    {
+      auto values_vec_uint16 = std::vector<uint16_t>{value.dim().begin(), value.dim().end()};
+      auto values_uint16 = fb.CreateVector(values_vec_uint16);
+      return tflite::CreateUint16Vector(fb, values_uint16).Union();
+    }
+    case tflchef::SparseIndexVecType::UINT8VEC:
+    {
+      auto values_vec_uint8 = std::vector<uint8_t>{value.dim().begin(), value.dim().end()};
+      auto values_uint8 = fb.CreateVector(values_vec_uint8);
+      return tflite::CreateUint8Vector(fb, values_uint8).Union();
+    }
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unknown SparseIndexVector type");
+}
diff --git a/compiler/tflchef/core/src/Convert.h b/compiler/tflchef/core/src/Convert.h
index b56e6ef69..45c93d229 100644
--- a/compiler/tflchef/core/src/Convert.h
+++ b/compiler/tflchef/core/src/Convert.h
@@ -28,5 +28,10 @@ tflite::Padding as_tflite_padding(const tflchef::Padding &value);
 tflite::ActivationFunctionType as_tflite_activation(const tflchef::Activation &value);
 tflite::TensorType as_tflite_tensortype(const tflchef::TensorType &value);
 tflite::MirrorPadMode as_tflite_mirrorpadmode(const tflchef::MirrorPadMode &value);
+tflite::DimensionType as_tflite_dimensiontype(const tflchef::DimensionType &value);
+tflite::SparseIndexVector as_tflite_sparse_idx_vec_type(const tflchef::SparseIndexVecType &value);
+flatbuffers::Offset<void>
+as_tflite_sparse_index_vec(flatbuffers::FlatBufferBuilder &fb,
+                           const ::tflchef::TensorSparsity_IndexVec &value);
 
 #endif // __CONVERT_H__
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index a4b435dfa..164011d68 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -376,6 +376,53 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
       quant_index = quant_builder.Finish();
     }
 
+    flatbuffers::Offset<tflite::SparsityParameters> sparsity_index;
+
+    if (operand.has_sparsity())
+    {
+      const auto &sparsity = operand.sparsity();
+
+      // Create traversal order
+      std::vector<int> traversal_order_vec{sparsity.traversal_order().dim().begin(),
+                                           sparsity.traversal_order().dim().end()};
+      auto traversal_order = flatbuffer_builder->CreateVector(traversal_order_vec);
+
+      // Create block map
+      std::vector<int> block_map_vec{sparsity.block_map().dim().begin(),
+                                     sparsity.block_map().dim().end()};
+      auto block_map = flatbuffer_builder->CreateVector(block_map_vec);
+
+      // Create dimension metadata
+      std::vector<flatbuffers::Offset<tflite::DimensionMetadata>> dim_metadata_vec;
+      auto recipe_dim_metadata = sparsity.dim_metadata();
+      for (const auto &dm : recipe_dim_metadata)
+      {
+        // Create array segments
+        auto tflite_array_segments =
+            as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_segments());
+
+        // Create array indices
+        auto tflite_array_indices =
+            as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_indices());
+
+        auto tflite_dim_metadata_builder = tflite::DimensionMetadataBuilder{*flatbuffer_builder};
+        tflite_dim_metadata_builder.add_format(as_tflite_dimensiontype(dm.format()));
+        tflite_dim_metadata_builder.add_dense_size(dm.dense_size());
+        tflite_dim_metadata_builder.add_array_segments(tflite_array_segments);
+        tflite_dim_metadata_builder.add_array_segments_type(
+            as_tflite_sparse_idx_vec_type(dm.array_segments().type()));
+        tflite_dim_metadata_builder.add_array_indices(tflite_array_indices);
+        tflite_dim_metadata_builder.add_array_indices_type(
+            as_tflite_sparse_idx_vec_type(dm.array_indices().type()));
+        auto tflite_dim_metadata = tflite_dim_metadata_builder.Finish();
+        dim_metadata_vec.emplace_back(tflite_dim_metadata);
+      }
+      auto dim_metadata = flatbuffer_builder->CreateVector(dim_metadata_vec);
+
+      sparsity_index = tflite::CreateSparsityParameters(*flatbuffer_builder, traversal_order,
+                                                        block_map, dim_metadata);
+    }
+
     // Create Tensor
     tflite::TensorBuilder tensor_builder{*flatbuffer_builder};
 
@@ -383,8 +430,10 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
     tensor_builder.add_type(as_tflite_tensortype(operand.type()));
     tensor_builder.add_buffer(buffer_index);
     tensor_builder.add_name(name);
+    tensor_builder.add_is_variable(operand.is_variable());
     if (operand.has_quant())
       tensor_builder.add_quantization(quant_index);
+    tensor_builder.add_sparsity(sparsity_index);
 
     // Append!
     tensor_vec.emplace_back(tensor_builder.Finish());
diff --git a/runtime/libs/ndarray/include/ndarray/Common.h b/compiler/tflchef/core/src/Op/Dequantize.cpp
index aa0cc6fe2..761d7f99e 100644
--- a/runtime/libs/ndarray/include/ndarray/Common.h
+++ b/compiler/tflchef/core/src/Op/Dequantize.cpp
@@ -1,11 +1,11 @@
 /*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,9 +14,14 @@
  * limitations under the License.
  */
 
-#ifndef _NDARRAY_COMMON_H_
-#define _NDARRAY_COMMON_H_
+#include "Dequantize.h"
 
-#define NDARRAY_MAX_DIMENSION_COUNT 8
+flatbuffers::Offset<void> DequantizeChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+  return flatbuffers::Offset<void>();
+}
 
-#endif //_NDARRAY_COMMON_H_
+std::unique_ptr<OpChef> DequantizeChefFactory::create(const tflchef::Operation *operation) const
+{
+  return std::unique_ptr<OpChef>{new DequantizeChef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/Dequantize.h b/compiler/tflchef/core/src/Op/Dequantize.h
new file mode 100644
index 000000000..82580560d
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/Dequantize.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_DEQUANTIZE_H__
+#define __OP_DEQUANTIZE_H__
+
+#include "OpChef.h"
+
+class DequantizeChef final : public OpChef
+{
+public:
+  explicit DequantizeChef(const tflchef::Operation *operation) : _operation{operation}
+  {
+    // DO NOTHING
+  }
+
+public:
+  tflite::BuiltinOperator code(void) const override { return tflite::BuiltinOperator_DEQUANTIZE; }
+
+  tflite::BuiltinOptions type(void) const override { return tflite::BuiltinOptions_NONE; }
+
+  flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+  const tflchef::Operation *_operation;
+};
+
+struct DequantizeChefFactory final : public OpChefFactory
+{
+  std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_DEQUANTIZE_H__
diff --git a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..ceabfc13c
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "UnidirectionalSequenceLSTM.h"
+#include "Convert.h"
+
+#include <cassert>
+
+flatbuffers::Offset<void>
+UnidirectionalSequenceLSTMChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+  auto &operation = (*_operation);
+
+  assert(operation.has_unidirectional_sequence_lstm_options());
+
+  tflite::UnidirectionalSequenceLSTMOptionsBuilder options_builder(fbb);
+  options_builder.add_fused_activation_function(
+      as_tflite_activation(operation.unidirectional_sequence_lstm_options().activation()));
+  options_builder.add_cell_clip(operation.unidirectional_sequence_lstm_options().cell_clip());
+  options_builder.add_proj_clip(operation.unidirectional_sequence_lstm_options().proj_clip());
+  options_builder.add_time_major(operation.unidirectional_sequence_lstm_options().time_major());
+  options_builder.add_asymmetric_quantize_inputs(
+      operation.unidirectional_sequence_lstm_options().asymmetric_quantize_inputs());
+
+  return options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef>
+UnidirectionalSequenceLSTMChefFactory::create(const tflchef::Operation *operation) const
+{
+  return std::unique_ptr<OpChef>{new UnidirectionalSequenceLSTMChef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..6811ad378
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_UNIDIRECTIONALSEQUENCELSTM_H__
+#define __OP_UNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "OpChef.h"
+
+class UnidirectionalSequenceLSTMChef final : public OpChef
+{
+public:
+  explicit UnidirectionalSequenceLSTMChef(const tflchef::Operation *operation)
+      : _operation{operation}
+  {
+    // DO NOTHING
+  }
+
+public:
+  tflite::BuiltinOperator code(void) const override
+  {
+    return tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM;
+  }
+
+  tflite::BuiltinOptions type(void) const override
+  {
+    return tflite::BuiltinOptions_UnidirectionalSequenceLSTMOptions;
+  }
+
+  flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+  const tflchef::Operation *_operation;
+};
+
+struct UnidirectionalSequenceLSTMChefFactory final : public OpChefFactory
+{
+  std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_UNIDIRECTIONALSEQUENCELSTM_H__
diff --git a/compiler/tflchef/core/src/OpChef.def b/compiler/tflchef/core/src/OpChef.def
index 6b242e811..f2755fc29 100644
--- a/compiler/tflchef/core/src/OpChef.def
+++ b/compiler/tflchef/core/src/OpChef.def
@@ -19,6 +19,7 @@ OP_CHEF(Conv2D, Conv2DChefFactory)
 OP_CHEF(Cos, CosChefFactory)
 OP_CHEF(DepthToSpace, DepthToSpaceChefFactory)
 OP_CHEF(DepthwiseConv2D, DepthwiseConv2DChefFactory)
+OP_CHEF(Dequantize, DequantizeChefFactory)
 OP_CHEF(Div, DivChefFactory)
 OP_CHEF(ELU, ELUChefFactory)
 OP_CHEF(Equal, EqualChefFactory)
@@ -105,6 +106,7 @@ OP_CHEF(Tile, TileChefFactory)
 OP_CHEF(TopKV2, TopKV2ChefFactory)
 OP_CHEF(Transpose, TransposeChefFactory)
 OP_CHEF(TransposeConv, TransposeConvChefFactory)
+OP_CHEF(UnidirectionalSequenceLSTM, UnidirectionalSequenceLSTMChefFactory)
 OP_CHEF(Unique, UniqueChefFactory)
 OP_CHEF(Unpack, UnpackChefFactory)
 OP_CHEF(Where, WhereChefFactory)
diff --git a/compiler/tflchef/core/src/OpChefs.h b/compiler/tflchef/core/src/OpChefs.h
index 7637b1c69..36b9bdb76 100644
--- a/compiler/tflchef/core/src/OpChefs.h
+++ b/compiler/tflchef/core/src/OpChefs.h
@@ -32,6 +32,7 @@
 #include "Op/Cos.h"
 #include "Op/DepthToSpace.h"
 #include "Op/DepthwiseConv2D.h"
+#include "Op/Dequantize.h"
 #include "Op/Div.h"
 #include "Op/ELU.h"
 #include "Op/Equal.h"
@@ -118,6 +119,7 @@
 #include "Op/TopKV2.h"
 #include "Op/Transpose.h"
 #include "Op/TransposeConv.h"
+#include "Op/UnidirectionalSequenceLSTM.h"
 #include "Op/Unique.h"
 #include "Op/Unpack.h"
 #include "Op/Where.h"
diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
index 9909d517a..2efb54c39 100644
--- a/compiler/tflchef/proto/tflchef.proto
+++ b/compiler/tflchef/proto/tflchef.proto
@@ -21,6 +21,18 @@ enum TensorType {
   BOOL = 6;
 }
 
+enum DimensionType {
+  DENSE = 0;
+  SPARSE_CSR = 1;
+}
+
+enum SparseIndexVecType {
+  SparseIdxVecType_NONE = 0;
+  INT32VEC = 1;
+  UINT16VEC = 2;
+  UINT8VEC = 3;
+}
+
 message TensorShape {
   repeated uint32 dim = 3;
 }
@@ -38,12 +50,37 @@ message TensorQuantization {
   optional int32 quantized_dimension = 5 [default = 0];
 }
 
+message TensorSparsity {
+  message TraversalOrder {
+    repeated int32 dim = 1;
+  }
+  message BlockMap {
+    repeated int32 dim = 1;
+  }
+  message IndexVec {
+    repeated int32 dim = 1;
+    optional SparseIndexVecType type = 2;
+  }
+  message DimMetaData {
+    optional DimensionType format = 1;
+    optional int32 dense_size = 2;
+    optional IndexVec array_segments = 3;
+    optional IndexVec array_indices = 4;
+  }
+
+  optional TraversalOrder traversal_order = 1;
+  optional BlockMap block_map = 2;
+  repeated DimMetaData dim_metadata = 3;
+}
+
 message Operand {
   optional string name = 1;
   optional TensorType type = 2;
   optional TensorShape shape = 3;
   optional TensorFiller filler = 4;
   optional TensorQuantization quant = 5;
+  optional TensorSparsity sparsity = 6;
+  optional bool is_variable = 7 [default = false];
 }
 
 // This enum value corresponds to Padding in TensorFlow Lite schema
@@ -58,6 +95,8 @@ enum Activation {
   RELU = 1;
   RELU_N1_TO_1 = 2;
   RELU6 = 3;
+  TANH = 4;
+  SIGN_BIT = 5;
 }
 
 // This enum value corresponds to MirrorPadMode in TensorFlow Lite schema
@@ -419,6 +458,14 @@ message SegmentSumOptions {
   // NONE
 }
 
+message UnidirectionalSequenceLSTMOptions {
+  optional Activation activation = 1 [default = NONE];
+  optional float cell_clip = 2 [default = 0.0];
+  optional float proj_clip = 3 [default = 0.0];
+  optional bool time_major = 4 [default = false];
+  optional bool asymmetric_quantize_inputs = 5 [default = false];
+}
+
 message UniqueOptions {
   optional TensorType idx_out_type = 1 [default = INT32];
 }
@@ -443,6 +490,10 @@ message MatrixSetDiagOptions {
   // NONE
 }
 
+message DequantizeOptions {
+  // NONE
+}
+
 message Operation {
   optional string type = 1;
   repeated string input = 2;
@@ -518,7 +569,7 @@ message Operation {
   // SequenceRNNOptions 166
   optional TopKV2Options topk_v2_options = 167;
   optional LogSoftmaxOptions log_softmax_options = 168;
-  // DequantizeOptions 169
+  optional DequantizeOptions dequantize_options = 169;
   optional NegOptions neg_options = 170;
   optional PadV2Options padv2_options = 171;
   optional LessEqualOptions lessequal_options = 172;
@@ -530,7 +581,7 @@ message Operation {
   // FakeQuantOptions 178
   // BidirectionalSequenceLSTMOptions 179
   // BidirectionalSequenceRNNOptions 180
-  // UnidirectionalSequenceLSTMOptions 181
+  optional UnidirectionalSequenceLSTMOptions unidirectional_sequence_lstm_options = 181;
   optional RangeOptions range_options = 182;
   optional ResizeNearestNeighborOptions resize_nearest_neighbor_options = 183;
   optional LeakyReluOptions leaky_relu_options = 184;
diff --git a/compiler/tflchef/tflite/src/Convert.cpp b/compiler/tflchef/tflite/src/Convert.cpp
index 3cc1c9238..29276ff94 100644
--- a/compiler/tflchef/tflite/src/Convert.cpp
+++ b/compiler/tflchef/tflite/src/Convert.cpp
@@ -55,9 +55,10 @@ tflchef::Activation as_tflchef_activation(const tflite::ActivationFunctionType t
       return tflchef::RELU_N1_TO_1;
     case tflite::ActivationFunctionType_RELU6:
       return tflchef::RELU6;
-    // TODO handle other types
-    // ActivationFunctionType_TANH
-    // ActivationFunctionType_SIGN_BIT
+    case tflite::ActivationFunctionType_TANH:
+      return tflchef::TANH;
+    case tflite::ActivationFunctionType_SIGN_BIT:
+      return tflchef::SIGN_BIT;
     default:
       throw std::runtime_error{"unsupported activation type"};
   }
@@ -89,4 +90,34 @@ tflchef::MirrorPadMode as_tflchef_mirrorpadmode(const tflite::MirrorPadMode mode
   }
 }
 
+tflchef::DimensionType as_tflchef_sparse_dim_type(const tflite::DimensionType type)
+{
+  switch (type)
+  {
+    case tflite::DimensionType_DENSE:
+      return tflchef::DimensionType::DENSE;
+    case tflite::DimensionType_SPARSE_CSR:
+      return tflchef::DimensionType::SPARSE_CSR;
+    default:
+      throw std::runtime_error("unsupported sparse dimension type");
+  }
+}
+
+tflchef::SparseIndexVecType as_tflchef_sparse_idx_vec_type(const tflite::SparseIndexVector type)
+{
+  switch (type)
+  {
+    case tflite::SparseIndexVector_NONE:
+      return tflchef::SparseIndexVecType::SparseIdxVecType_NONE;
+    case tflite::SparseIndexVector_Int32Vector:
+      return tflchef::SparseIndexVecType::INT32VEC;
+    case tflite::SparseIndexVector_Uint16Vector:
+      return tflchef::SparseIndexVecType::UINT16VEC;
+    case tflite::SparseIndexVector_Uint8Vector:
+      return tflchef::SparseIndexVecType::UINT8VEC;
+    default:
+      throw std::runtime_error("unsupported sparse index vector type");
+  }
+}
+
 } // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Convert.h b/compiler/tflchef/tflite/src/Convert.h
index 770bffa4d..cf0c61550 100644
--- a/compiler/tflchef/tflite/src/Convert.h
+++ b/compiler/tflchef/tflite/src/Convert.h
@@ -28,6 +28,8 @@ tflchef::TensorType as_tflchef_type(const tflite::TensorType type);
 tflchef::Activation as_tflchef_activation(const tflite::ActivationFunctionType type);
 tflchef::Padding as_tflchef_padding(const tflite::Padding padding);
 tflchef::MirrorPadMode as_tflchef_mirrorpadmode(const tflite::MirrorPadMode mode);
+tflchef::DimensionType as_tflchef_sparse_dim_type(const tflite::DimensionType type);
+tflchef::SparseIndexVecType as_tflchef_sparse_idx_vec_type(const tflite::SparseIndexVector type);
 
 /**
  * @brief extract buffer data to std::vector<DT>
diff --git a/compiler/tflchef/tflite/src/Op/Dequantize.cpp b/compiler/tflchef/tflite/src/Op/Dequantize.cpp
new file mode 100644
index 000000000..436a0db19
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/Dequantize.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dequantize.h"
+
+#include "Convert.h"
+
+namespace tflchef
+{
+
+void TFliteOpDequantize::filler(const tflite::Operator *op, TFliteImport *import,
+                                tflchef::ModelRecipe *model_recipe) const
+{
+  // Nothing to do with filler
+}
+
+tflchef::Operation *TFliteOpDequantize::build(const tflite::Operator *, TFliteImport *import,
+                                              tflchef::ModelRecipe *model_recipe) const
+{
+  auto operation = model_recipe->add_operation();
+
+  operation->set_type("Dequantize");
+
+  return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/Dequantize.h b/compiler/tflchef/tflite/src/Op/Dequantize.h
new file mode 100644
index 000000000..df1c7bbdb
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/Dequantize.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_DEQUANTIZE_H__
+#define __TFLITE_OP_DEQUANTIZE_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for Dequantize
+ */
+class TFliteOpDequantize : public TFliteOpChef
+{
+public:
+  void filler(const tflite::Operator *op, TFliteImport *import,
+              tflchef::ModelRecipe *model_recipe) const override;
+  tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+                            tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_DEQUANTIZE_H__
diff --git a/compiler/tflchef/tflite/src/Op/FullyConnected.cpp b/compiler/tflchef/tflite/src/Op/FullyConnected.cpp
index 4291c844b..1f6e73aa6 100644
--- a/compiler/tflchef/tflite/src/Op/FullyConnected.cpp
+++ b/compiler/tflchef/tflite/src/Op/FullyConnected.cpp
@@ -17,6 +17,7 @@
 #include "FullyConnected.h"
 
 #include "Convert.h"
+#include "FillerHelper.h"
 
 namespace tflchef
 {
@@ -24,7 +25,14 @@ namespace tflchef
 void TFliteOpFullyConnected::filler(const tflite::Operator *op, TFliteImport *import,
                                     tflchef::ModelRecipe *model_recipe) const
 {
-  // Nothing to do with filler
+  const auto &inputs = *op->inputs();
+
+  for (uint32_t idx = 1; idx < inputs.size(); idx++)
+  {
+    // optional input tensor idx has minus value.
+    if (inputs[idx] >= 0)
+      fill_tensor_to_import(inputs[idx], import);
+  }
 }
 
 tflchef::Operation *TFliteOpFullyConnected::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..c2c79285b
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "UnidirectionalSequenceLSTM.h"
+
+#include "Convert.h"
+#include "FillerHelper.h"
+
+namespace tflchef
+{
+
+void TFliteOpUnidirectionalSequenceLSTM::filler(const tflite::Operator *op, TFliteImport *import,
+                                                tflchef::ModelRecipe *model_recipe) const
+{
+  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+  assert(inputs.size() == 24);
+
+  for (int32_t i = 0; i < inputs.size(); i++)
+  {
+    // Except for Input 0, 17 and 18.
+    // Each Input mean Input[0](=Input Tensor), Input[17](=OutputState Tensor) and
+    // Input[18](=CellState Tensor).
+    // This could be updated from previous input or User Given data, so This could not be Const
+    if (i == 0 || i == 17 || i == 18)
+      continue;
+    if (inputs[i] != -1)
+      fill_tensor_to_import(inputs[i], import);
+  }
+}
+
+tflchef::Operation *
+TFliteOpUnidirectionalSequenceLSTM::build(const tflite::Operator *op, TFliteImport *import,
+                                          tflchef::ModelRecipe *model_recipe) const
+{
+  auto op_params = op->builtin_options_as_UnidirectionalSequenceLSTMOptions();
+  assert(op_params != nullptr);
+
+  auto operation = model_recipe->add_operation();
+
+  operation->set_type("UnidirectionalSequenceLSTM");
+
+  auto op_options = operation->mutable_unidirectional_sequence_lstm_options();
+
+  op_options->set_activation(as_tflchef_activation(op_params->fused_activation_function()));
+  op_options->set_cell_clip(op_params->cell_clip());
+  op_options->set_proj_clip(op_params->proj_clip());
+  op_options->set_time_major(op_params->time_major());
+  op_options->set_asymmetric_quantize_inputs(op_params->asymmetric_quantize_inputs());
+
+  return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.h b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..cc4e5fb0f
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_UNIDIRECTIONALSEQUENCELSTM_H__
+#define __TFLITE_OP_UNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for UnidirectionalSequenceLSTM
+ */
+class TFliteOpUnidirectionalSequenceLSTM : public TFliteOpChef
+{
+public:
+  void filler(const tflite::Operator *op, TFliteImport *import,
+              tflchef::ModelRecipe *model_recipe) const override;
+  tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+                            tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_UNIDIRECTIONALSEQUENCELSTM_H__
diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp
index 088961c1c..32ac9c3f3 100644
--- a/compiler/tflchef/tflite/src/RecipeChef.cpp
+++ b/compiler/tflchef/tflite/src/RecipeChef.cpp
@@ -110,6 +110,7 @@ std::unique_ptr<ModelRecipe> generate_recipe(const tflite::Model *model)
 
     operand->set_name(tensor_name(tensor));
     operand->set_type(as_tflchef_type(tensor->type()));
+    operand->set_is_variable(tensor->is_variable());
 
     if (tensor->shape())
     {
@@ -187,6 +188,89 @@ std::unique_ptr<ModelRecipe> generate_recipe(const tflite::Model *model)
       tflchef::TensorQuantization *chef_quant = operand->mutable_quant();
       chef_quant->set_quantized_dimension(quant->quantized_dimension());
     }
+
+    auto sparsity = tensor->sparsity();
+    if (sparsity != nullptr)
+    {
+      tflchef::TensorSparsity *chef_sparsity = operand->mutable_sparsity();
+      // traversal_order
+      auto chef_traversal_order = chef_sparsity->mutable_traversal_order();
+      for (const auto &to : *(sparsity->traversal_order()))
+      {
+        chef_traversal_order->add_dim(to);
+      }
+      // block_map
+      auto chef_block_map = chef_sparsity->mutable_block_map();
+      for (const auto &bm : *(sparsity->block_map()))
+      {
+        chef_block_map->add_dim(bm);
+      }
+      // dim_metadata
+      for (const auto &dm : *(sparsity->dim_metadata()))
+      {
+        auto chef_dm = chef_sparsity->add_dim_metadata();
+        // format
+        chef_dm->set_format(as_tflchef_sparse_dim_type(dm->format()));
+        // dense_size
+        chef_dm->set_dense_size(dm->dense_size());
+        // array_segments
+        auto chef_array_segments = chef_dm->mutable_array_segments();
+        switch (dm->array_segments_type())
+        {
+          case tflite::SparseIndexVector_NONE:
+            // DO NOTHING
+            break;
+          case tflite::SparseIndexVector_Int32Vector:
+            for (const auto &as : *(dm->array_segments_as_Int32Vector()->values()))
+            {
+              chef_array_segments->add_dim(as);
+            }
+            break;
+          case tflite::SparseIndexVector_Uint16Vector:
+            for (const auto &as : *(dm->array_segments_as_Uint16Vector()->values()))
+            {
+              chef_array_segments->add_dim(as);
+            }
+            break;
+          case tflite::SparseIndexVector_Uint8Vector:
+            for (const auto &as : *(dm->array_segments_as_Uint8Vector()->values()))
+            {
+              chef_array_segments->add_dim(as);
+            }
+            break;
+          default:
+            throw std::runtime_error("unsupported sparse index vector type");
+        }
+        // array_indices
+        auto chef_array_indices = chef_dm->mutable_array_indices();
+        switch (dm->array_indices_type())
+        {
+          case tflite::SparseIndexVector_NONE:
+            // DO NOTHING
+            break;
+          case tflite::SparseIndexVector_Int32Vector:
+            for (const auto &as : *(dm->array_indices_as_Int32Vector()->values()))
+            {
+              chef_array_indices->add_dim(as);
+            }
+            break;
+          case tflite::SparseIndexVector_Uint16Vector:
+            for (const auto &as : *(dm->array_indices_as_Uint16Vector()->values()))
+            {
+              chef_array_indices->add_dim(as);
+            }
+            break;
+          case tflite::SparseIndexVector_Uint8Vector:
+            for (const auto &as : *(dm->array_indices_as_Uint8Vector()->values()))
+            {
+              chef_array_indices->add_dim(as);
+            }
+            break;
+          default:
+            throw std::runtime_error("unsupported sparse index vector type");
+        }
+      }
+    }
   }
 
   // add all operators
diff --git a/compiler/tflchef/tflite/src/TFliteOpChefs.h b/compiler/tflchef/tflite/src/TFliteOpChefs.h
index 36a010957..2e4d28051 100644
--- a/compiler/tflchef/tflite/src/TFliteOpChefs.h
+++ b/compiler/tflchef/tflite/src/TFliteOpChefs.h
@@ -33,6 +33,7 @@
 #include "Op/Cos.h"
 #include "Op/DepthToSpace.h"
 #include "Op/DepthwiseConv2D.h"
+#include "Op/Dequantize.h"
 #include "Op/Div.h"
 #include "Op/ELU.h"
 #include "Op/Equal.h"
@@ -118,6 +119,7 @@
 #include "Op/TopKV2.h"
 #include "Op/Transpose.h"
 #include "Op/TransposeConv.h"
+#include "Op/UnidirectionalSequenceLSTM.h"
 #include "Op/Unique.h"
 #include "Op/Unpack.h"
 #include "Op/Where.h"
diff --git a/compiler/tflchef/tflite/src/TFliteOpRegistry.h b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
index a454e98b6..9cc630a97 100644
--- a/compiler/tflchef/tflite/src/TFliteOpRegistry.h
+++ b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
@@ -70,6 +70,7 @@ private:
     REG_TFL_OP(COS, TFliteOpCos);
     REG_TFL_OP(DEPTH_TO_SPACE, TFliteOpDepthToSpace);
     REG_TFL_OP(DEPTHWISE_CONV_2D, TFliteOpDepthwiseConv2D);
+    REG_TFL_OP(DEQUANTIZE, TFliteOpDequantize);
     REG_TFL_OP(DIV, TFliteOpDiv);
     REG_TFL_OP(ELU, TFliteOpELU);
     REG_TFL_OP(EQUAL, TFliteOpEqual);
@@ -155,6 +156,7 @@ private:
     REG_TFL_OP(TOPK_V2, TFliteOpTopKV2);
     REG_TFL_OP(TRANSPOSE, TFliteOpTranspose);
     REG_TFL_OP(TRANSPOSE_CONV, TFliteOpTransposeConv);
+    REG_TFL_OP(UNIDIRECTIONAL_SEQUENCE_LSTM, TFliteOpUnidirectionalSequenceLSTM);
     REG_TFL_OP(UNIQUE, TFliteOpUnique);
     REG_TFL_OP(UNPACK, TFliteOpUnpack);
     REG_TFL_OP(WHERE, TFliteOpWhere);
diff --git a/compiler/tfldump/src/Dump.cpp b/compiler/tfldump/src/Dump.cpp
index e1562d42f..8c8178f93 100644
--- a/compiler/tfldump/src/Dump.cpp
+++ b/compiler/tfldump/src/Dump.cpp
@@ -73,10 +73,34 @@ std::ostream &operator<<(std::ostream &os, const std::vector<int32_t> &vect)
   return os;
 }
 
-template <typename T> void dump_fbvect(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
+template <typename T>
+void dump_fbvect(std::ostream &os, const flatbuffers::Vector<T> *fbvect, uint32_t size)
+{
+  for (uint32_t q = 0; q < size; q++)
+  {
+    if (q)
+      os << ", ";
+    os << fbvect->Get(q);
+  }
+}
+
+template <>
+void dump_fbvect(std::ostream &os, const flatbuffers::Vector<uint8_t> *fbvect, uint32_t size)
+{
+  assert(fbvect);
+  for (uint32_t q = 0; q < size; q++)
+  {
+    if (q)
+      os << ", ";
+    os << static_cast<uint32_t>(fbvect->Get(q));
+  }
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
 {
   if (fbvect == nullptr)
-    return;
+    return os;
 
   bool ellipsis = (fbvect->size() > 4);
   auto limit_size = ellipsis ? 4 : fbvect->size();
@@ -85,22 +109,14 @@ template <typename T> void dump_fbvect(std::ostream &os, const flatbuffers::Vect
   {
     os << "(" << fbvect->size() << ") ";
   }
-  for (uint32_t q = 0; q < limit_size; q++)
-  {
-    if (q)
-      os << ", ";
-    os << fbvect->Get(q);
-  }
+
+  dump_fbvect(os, fbvect, limit_size);
+
   if (ellipsis)
   {
     os << " ... ";
   }
-}
 
-template <typename T>
-std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
-{
-  dump_fbvect(os, fbvect);
   return os;
 }
 
@@ -169,8 +185,90 @@ void dump_sub_graph(std::ostream &os, tflread::Reader &reader)
         os << std::endl;
       }
     }
+
+    if (const auto &s_params = tensor->sparsity())
+    {
+      std::string strsparsity = "    Sparsity: ";
+      std::string strsindent(strsparsity.size(), ' ');
+      os << strsparsity;
+
+      if (s_params->traversal_order())
+      {
+        os << "traversal_order(" << s_params->traversal_order() << ") ";
+        os << std::endl << strsindent;
+      }
+      if (s_params->block_map())
+      {
+        os << "block_map(" << s_params->block_map() << ") ";
+        os << std::endl << strsindent;
+      }
+      if (const auto &dim_metadata = s_params->dim_metadata())
+      {
+        uint32_t idx = 0;
+        for (const auto &dm : *dim_metadata)
+        {
+          std::string strdm = "dim_metadata[" + std::to_string(idx++) + "]: ";
+          std::string strdm_indent = strsindent + std::string(strdm.size(), ' ');
+          os << strdm;
+
+          os << "format(" << tflite::EnumNameDimensionType(dm->format()) << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "dense_size(" << dm->dense_size() << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_segments_type("
+             << tflite::EnumNameSparseIndexVector(dm->array_segments_type()) << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_segments(";
+          switch (dm->array_segments_type())
+          {
+            case tflite::SparseIndexVector_NONE:
+              // DO NOTHING
+              break;
+            case tflite::SparseIndexVector_Int32Vector:
+              os << dm->array_segments_as_Int32Vector()->values();
+              break;
+            case tflite::SparseIndexVector_Uint16Vector:
+              os << dm->array_segments_as_Uint16Vector()->values();
+              break;
+            case tflite::SparseIndexVector_Uint8Vector:
+              os << dm->array_segments_as_Uint8Vector()->values();
+              break;
+            default:
+              throw std::runtime_error("Invalid SparseIndexVector type of array_segments");
+          }
+          os << ")" << std::endl << strdm_indent;
+
+          os << "array_indices_type(" << tflite::EnumNameSparseIndexVector(dm->array_indices_type())
+             << ") ";
+          os << std::endl << strdm_indent;
+
+          os << "array_indices(";
+          switch (dm->array_indices_type())
+          {
+            case tflite::SparseIndexVector_NONE:
+              // DO NOTHING
+              break;
+            case tflite::SparseIndexVector_Int32Vector:
+              os << dm->array_indices_as_Int32Vector()->values();
+              break;
+            case tflite::SparseIndexVector_Uint16Vector:
+              os << dm->array_indices_as_Uint16Vector()->values();
+              break;
+            case tflite::SparseIndexVector_Uint8Vector:
+              os << dm->array_indices_as_Uint8Vector()->values();
+              break;
+            default:
+              throw std::runtime_error("Invalid SparseIndexVector type of array_indices");
+          }
+          os << ")" << std::endl << strsindent;
+        }
+      }
+    }
+    os << std::endl;
   }
-  os << std::endl;
 
   // dump operators
   os << "Operators: O(subgraph index : operator index) OpCodeName " << std::endl;
diff --git a/compiler/tfldump/src/OpPrinter.cpp b/compiler/tfldump/src/OpPrinter.cpp
index 24b9264ff..5d279632c 100644
--- a/compiler/tfldump/src/OpPrinter.cpp
+++ b/compiler/tfldump/src/OpPrinter.cpp
@@ -592,6 +592,25 @@ public:
   }
 };
 
+class UnidirectionalSequenceLSTMPrinter : public OpPrinter
+{
+public:
+  void options(const tflite::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_UnidirectionalSequenceLSTMOptions())
+    {
+      os << "    ";
+      os << "Activation(" << EnumNameActivationFunctionType(params->fused_activation_function())
+         << ") ";
+      os << "cell_clip(" << params->cell_clip() << ") ";
+      os << "proj_clip(" << params->proj_clip() << ") ";
+      os << "time_major(" << params->time_major() << ") ";
+      os << "asymmetric_quantize_inputs(" << params->asymmetric_quantize_inputs() << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 class UniquePrinter : public OpPrinter
 {
 public:
@@ -659,6 +678,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[tflite::BuiltinOperator_CONV_2D] = make_unique<Conv2DPrinter>();
   _op_map[tflite::BuiltinOperator_DEPTH_TO_SPACE] = make_unique<DepthToSpacePrinter>();
   _op_map[tflite::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
+  // There is no Option for DEQUANTIZE
   _op_map[tflite::BuiltinOperator_DIV] = make_unique<DivPrinter>();
   // There is no Option for FLOOR
   // There is no Option for FLOOR_MOD
@@ -713,6 +733,8 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[tflite::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
   _op_map[tflite::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
   // There is no Option for TOPK_V2
+  _op_map[tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM] =
+      make_unique<UnidirectionalSequenceLSTMPrinter>();
   _op_map[tflite::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
   _op_map[tflite::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
   _op_map[tflite::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions.h
index 680118618..56a16d4e0 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions.h
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions.h
@@ -104,6 +104,7 @@
 #include "BuildBuiltinOptions/TopKV2Options.h"
 #include "BuildBuiltinOptions/TransposeOptions.h"
 #include "BuildBuiltinOptions/TransposeConvOptions.h"
+#include "BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.h"
 #include "BuildBuiltinOptions/UniqueOptions.h"
 #include "BuildBuiltinOptions/UnpackOptions.h"
 #include "BuildBuiltinOptions/WhereOptions.h"
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp
new file mode 100644
index 000000000..64ceb5a74
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "UnidirectionalSequenceLSTMOptions.h"
+#include "DataLookup.h"
+
+#include <cassert>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::UnidirectionalSequenceLSTMOptions>
+build_circle_UnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &fb,
+                                               const tflite::Operator *op)
+{
+  auto tflite_builtin_options = op->builtin_options_as_UnidirectionalSequenceLSTMOptions();
+  circle::UnidirectionalSequenceLSTMOptionsBuilder builtin_options_builder{fb};
+  builtin_options_builder.add_fused_activation_function(
+      get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+  builtin_options_builder.add_cell_clip(tflite_builtin_options->cell_clip());
+  builtin_options_builder.add_proj_clip(tflite_builtin_options->proj_clip());
+  builtin_options_builder.add_time_major(tflite_builtin_options->time_major());
+  builtin_options_builder.add_asymmetric_quantize_inputs(
+      tflite_builtin_options->asymmetric_quantize_inputs());
+  return builtin_options_builder.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.h
new file mode 100644
index 000000000..2be0efbc2
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_UNIDIRECTIONALSEQUENCELSTM_OPTIONS_H__
+#define __BBO_UNIDIRECTIONALSEQUENCELSTM_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::UnidirectionalSequenceLSTMOptions>
+build_circle_UnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &fb,
+                                               const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_UNIDIRECTIONALSEQUENCELSTM_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/CircleModel.cpp b/compiler/tflite2circle/src/CircleModel.cpp
index 14c44cb36..a95c37089 100644
--- a/compiler/tflite2circle/src/CircleModel.cpp
+++ b/compiler/tflite2circle/src/CircleModel.cpp
@@ -152,14 +152,14 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
           // array_segments
           auto tflite_array_segments_type = it->array_segments_type();
           auto circle_array_segments =
-              get_circle_sparse_index_vector(*fb, it, tflite_array_segments_type);
+              get_circle_sparse_index_vector(*fb, it->array_segments(), tflite_array_segments_type);
           auto circle_array_segments_type =
               get_circle_sparse_index_vector_type(tflite_array_segments_type);
 
           // array_indices
           auto tflite_array_indices_type = it->array_indices_type();
           auto circle_array_indices =
-              get_circle_sparse_index_vector(*fb, it, tflite_array_indices_type);
+              get_circle_sparse_index_vector(*fb, it->array_indices(), tflite_array_indices_type);
           auto circle_array_indices_type =
               get_circle_sparse_index_vector_type(tflite_array_indices_type);
 
@@ -218,41 +218,44 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
     std::vector<flatbuffers::Offset<circle::Operator>> operator_vec;
 
     auto tflite_operators = it_sg->operators();
-    for (auto it : *tflite_operators)
+    if (tflite_operators != nullptr)
     {
-      // inputs
-      std::vector<int32_t> input_vec{it->inputs()->begin(), it->inputs()->end()};
-      auto circle_inputs = fb->CreateVector(input_vec);
-      // outputs
-      std::vector<int32_t> output_vec{it->outputs()->begin(), it->outputs()->end()};
-      auto circle_outputs = fb->CreateVector(output_vec);
-      // builtin options
-      auto circle_builtin_options = get_circle_builtin_options(*fb, it);
-      auto circle_builtin_options_type = get_circle_builtin_options_type(it);
-      // custom options
-      flatbuffers::Offset<flatbuffers::Vector<uint8_t>> circle_custom_options;
-      if (it->custom_options())
+      for (auto it : *tflite_operators)
       {
-        std::vector<uint8_t> custom_options_vec{it->custom_options()->begin(),
-                                                it->custom_options()->end()};
-        circle_custom_options = fb->CreateVector(custom_options_vec);
+        // inputs
+        std::vector<int32_t> input_vec{it->inputs()->begin(), it->inputs()->end()};
+        auto circle_inputs = fb->CreateVector(input_vec);
+        // outputs
+        std::vector<int32_t> output_vec{it->outputs()->begin(), it->outputs()->end()};
+        auto circle_outputs = fb->CreateVector(output_vec);
+        // builtin options
+        auto circle_builtin_options = get_circle_builtin_options(*fb, it);
+        auto circle_builtin_options_type = get_circle_builtin_options_type(it);
+        // custom options
+        flatbuffers::Offset<flatbuffers::Vector<uint8_t>> circle_custom_options;
+        if (it->custom_options())
+        {
+          std::vector<uint8_t> custom_options_vec{it->custom_options()->begin(),
+                                                  it->custom_options()->end()};
+          circle_custom_options = fb->CreateVector(custom_options_vec);
+        }
+        // custom options format
+        // TODO Make get_circle_custom_options_format
+        assert(it->custom_options_format() == tflite::CustomOptionsFormat_FLEXBUFFERS);
+        auto circle_custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS;
+
+        circle::OperatorBuilder operator_builder{*fb};
+        operator_builder.add_opcode_index(it->opcode_index());
+        operator_builder.add_inputs(circle_inputs);
+        operator_builder.add_outputs(circle_outputs);
+        operator_builder.add_builtin_options(circle_builtin_options);
+        operator_builder.add_builtin_options_type(circle_builtin_options_type);
+        operator_builder.add_custom_options(circle_custom_options);
+        operator_builder.add_custom_options_format(circle_custom_options_format);
+        // TODO mutating_variable_inputs
+        auto opeartor = operator_builder.Finish();
+        operator_vec.emplace_back(opeartor);
       }
-      // custom options format
-      // TODO Make get_circle_custom_options_format
-      assert(it->custom_options_format() == tflite::CustomOptionsFormat_FLEXBUFFERS);
-      auto circle_custom_options_format = circle::CustomOptionsFormat_FLEXBUFFERS;
-
-      circle::OperatorBuilder operator_builder{*fb};
-      operator_builder.add_opcode_index(it->opcode_index());
-      operator_builder.add_inputs(circle_inputs);
-      operator_builder.add_outputs(circle_outputs);
-      operator_builder.add_builtin_options(circle_builtin_options);
-      operator_builder.add_builtin_options_type(circle_builtin_options_type);
-      operator_builder.add_custom_options(circle_custom_options);
-      operator_builder.add_custom_options_format(circle_custom_options_format);
-      // TODO mutating_variable_inputs
-      auto opeartor = operator_builder.Finish();
-      operator_vec.emplace_back(opeartor);
     }
     auto circle_operators = fb->CreateVector(operator_vec);
 
diff --git a/compiler/tflite2circle/src/DataLookup.cpp b/compiler/tflite2circle/src/DataLookup.cpp
index 75504b062..f8dd75f4c 100644
--- a/compiler/tflite2circle/src/DataLookup.cpp
+++ b/compiler/tflite2circle/src/DataLookup.cpp
@@ -137,8 +137,7 @@ circle::DimensionType get_circle_dimension_type(tflite::DimensionType tfl_dim_ty
 }
 
 flatbuffers::Offset<void>
-get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
-                               const tflite::DimensionMetadata *dm,
+get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *v_array,
                                const tflite::SparseIndexVector &tfl_sparse_index_vector_type)
 {
   switch (tfl_sparse_index_vector_type)
@@ -147,9 +146,9 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
       return flatbuffers::Offset<void>();
     case tflite::SparseIndexVector_Int32Vector:
     {
+      const tflite::Int32Vector *i32_array = static_cast<const tflite::Int32Vector *>(v_array);
       auto values_vec_int32 =
-          std::vector<int32_t>{dm->array_segments_as_Int32Vector()->values()->begin(),
-                               dm->array_segments_as_Int32Vector()->values()->end()};
+          std::vector<int32_t>{i32_array->values()->begin(), i32_array->values()->end()};
       auto values_int32 = fb.CreateVector(values_vec_int32);
       circle::Int32VectorBuilder int32_vector_builder{fb};
       int32_vector_builder.add_values(values_int32);
@@ -157,9 +156,9 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
     }
     case tflite::SparseIndexVector_Uint16Vector:
     {
+      const tflite::Uint16Vector *u16_array = static_cast<const tflite::Uint16Vector *>(v_array);
       auto values_vec_uint16 =
-          std::vector<uint16_t>{dm->array_segments_as_Uint16Vector()->values()->begin(),
-                                dm->array_segments_as_Uint16Vector()->values()->end()};
+          std::vector<uint16_t>{u16_array->values()->begin(), u16_array->values()->end()};
       auto values_uint16 = fb.CreateVector(values_vec_uint16);
       circle::Uint16VectorBuilder uint16_vector_builder{fb};
       uint16_vector_builder.add_values(values_uint16);
@@ -167,9 +166,9 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
     }
     case tflite::SparseIndexVector_Uint8Vector:
     {
+      const tflite::Uint8Vector *u8_array = static_cast<const tflite::Uint8Vector *>(v_array);
       auto values_vec_uint8 =
-          std::vector<uint8_t>{dm->array_segments_as_Uint8Vector()->values()->begin(),
-                               dm->array_segments_as_Uint8Vector()->values()->end()};
+          std::vector<uint8_t>{u8_array->values()->begin(), u8_array->values()->end()};
       auto values_uint8 = fb.CreateVector(values_vec_uint8);
       circle::Uint8VectorBuilder uint8_vector_builder{fb};
       uint8_vector_builder.add_values(values_uint8);
diff --git a/compiler/tflite2circle/src/DataLookup.h b/compiler/tflite2circle/src/DataLookup.h
index 26ad74666..58a357703 100644
--- a/compiler/tflite2circle/src/DataLookup.h
+++ b/compiler/tflite2circle/src/DataLookup.h
@@ -85,8 +85,7 @@ circle::DimensionType get_circle_dimension_type(tflite::DimensionType tfl_dim_ty
  * @brief Returns circle SparseIndexVector according to tflite.
 */
 flatbuffers::Offset<void>
-get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb,
-                               const tflite::DimensionMetadata *dm,
+get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *values,
                                const tflite::SparseIndexVector &tfl_sparse_index_vector_type);
 
 /**
diff --git a/compiler/tflite2circle/src/TFLBuiltinOptions.lst b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
index 22b59863b..4bc101f8e 100644
--- a/compiler/tflite2circle/src/TFLBuiltinOptions.lst
+++ b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
@@ -76,7 +76,7 @@ TFL_BUILTIN_OPTIONS(ZerosLikeOptions)
 TFL_BUILTIN_OPTIONS(FillOptions)
 //TFL_BUILTIN_OPTIONS(BidirectionalSequenceLSTMOptions)
 //TFL_BUILTIN_OPTIONS(BidirectionalSequenceRNNOptions)
-//TFL_BUILTIN_OPTIONS(UnidirectionalSequenceLSTMOptions)
+TFL_BUILTIN_OPTIONS(UnidirectionalSequenceLSTMOptions)
 TFL_BUILTIN_OPTIONS(FloorModOptions)
 TFL_BUILTIN_OPTIONS(RangeOptions)
 TFL_BUILTIN_OPTIONS(ResizeNearestNeighborOptions)
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
index be4398996..ffca275db 100644
--- a/compiler/vconone/CMakeLists.txt
+++ b/compiler/vconone/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT VCONONE_VERSION)
-  set(VCONONE_VERSION 0x0000000000090001)
+  set(VCONONE_VERSION 0x00000000000a0001)
   # NOTE order is [build patch minor major]
   # if VCONONE_VERSION is set with -D option, it will be cached
   # you may have to remove cache file if you remove -D option
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
new file mode 100644
index 000000000..a0aa0560b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ *
+ * @note The default data type for an uninitialized output tensor is
+ *       signed 32-bit integer (S32). It is the user's responsibility to check
+ *       that the results do not overflow because the indices are computed
+ *       in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerKernelEx : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLArgMinMaxLayerKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete;
+  /** Allow instances of this class to be moved */
+  CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default;
+  /** Allow instances of this class to be moved */
+  CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default;
+  /** Default destructor */
+  ~CLArgMinMaxLayerKernelEx() = default;
+
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input       Source tensor. Data types supported: S32/F16/F32.
+   * @param[in]  prev_output Destination tensor of the previous iterations of @ref
+   * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32
+   *                         Has to be nullptr for the first iteration
+   * @param[out] output      Destination tensor. Data types supported: U32/S32
+   *                         Output will have the same number of dimensions as input.
+   * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
+   * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
+   */
+  void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output,
+                 unsigned int axis, ReductionOperation op);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLArgMinMaxLayerKernelEx.
+   *
+   * @param[in] input       Source tensor info. Data types supported: S32/F16/F32.
+   * @param[in] prev_output Destination tensor info of the previous iterations. Data types
+   * supported: U32/S32
+   *                        Has to be nullptr for the first iteration
+   * @param[in] output      Destination tensor info. Data types supported: U32/S32
+   *                        Output will have the same number of dimensions as input.
+   * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
+   * @param[in] op          Reduction operation to perform.  Only ArgMin and ArgMax are supported.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+                         const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  const ICLTensor *_prev_output;
+  ICLTensor *_output;
+  unsigned int _reduction_axis;
+  ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
new file mode 100644
index 000000000..ed668fd9c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file      CLCastBoolKernel.h
+ * @ingroup   COM_AI_RUNTIME
+ * @brief     This file defines CLCastBoolKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class for the kernel converting boolean type
+ */
+class CLCastBoolKernel : public ICLSimple3DKernel
+{
+public:
+  /**
+   * @brief Initialise the kernel's input and output.
+   * @param[in]  input  Input tensor. Data types supported: U8
+   * @param[in]  output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @return N/A
+   */
+  void configure(const ICLTensor *input, ICLTensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLCastBoolKernel
+   *
+   * @param[in] input  Source tensor info. Data types supported: U8.
+   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
new file mode 100644
index 000000000..a512057b9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the kernel to perform one-hot encoding*/
+class CLOneHotKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLOneHotKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLOneHotKernel(const CLOneHotKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLOneHotKernel &operator=(const CLOneHotKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLOneHotKernel(CLOneHotKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLOneHotKernel &operator=(CLOneHotKernel &&) = default;
+  /** Default destructor */
+  ~CLOneHotKernel() = default;
+  /** Initialise the kernel's inputs and output
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+   * Same as @p on_value
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+                 ICLTensor *output, int depth, int axis = -1);
+  /** Initialise the kernel's inputs and output already initialized to off_value
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth,
+                 int axis = -1);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLOneHotKernel
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+   * Same as @p on_value
+   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                         const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                         int axis = -1);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLOneHotKernel without off_value
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                         const ITensorInfo *output, int depth, int axis = -1);
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  /** Initialise the kernel's inputs and outputs internally
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   */
+  void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+                        int depth, int axis);
+
+private:
+  const ICLTensor *_indices;   /**< Indices tensor */
+  const ICLTensor *_on_value;  /**< On value tensor */
+  const ICLTensor *_off_value; /**< Off value tensor */
+  ICLTensor *_output;          /**< Destination tensor */
+  bool _is_off_value_memset;   /**< Whether off_value is zero */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h
deleted file mode 100644
index 6e8bdc1c2..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CPPONEHOTERNEL_H__
-#define __ARM_COMPUTE_CPPONEHOTERNEL_H__
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** CPP kernel to perform tensor OneHot operation. */
-class CPPOneHotKernelEx : public ICPPKernel
-{
-public:
-  const char *name() const override { return "CPPOneHotKernelEx"; }
-  /** Default constructor */
-  CPPOneHotKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CPPOneHotKernelEx(const CPPOneHotKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CPPOneHotKernelEx &operator=(const CPPOneHotKernelEx &) = delete;
-  /** Allow instances of this class to be moved */
-  CPPOneHotKernelEx(CPPOneHotKernelEx &&) = default;
-  /** Allow instances of this class to be moved */
-  CPPOneHotKernelEx &operator=(CPPOneHotKernelEx &&) = default;
-  /** Default destructor */
-  ~CPPOneHotKernelEx() = default;
-
-  /** Set the input and output of the kernel.
-   *
-   * @param[in]  indices     A tensor for indices. Data types supported: S32
-   * @param[in]  depth       A tensor for depth. Data types supported: S32
-   * @param[in]  on_value    A tensor for on_value. Data types supported: F32
-   * @param[in]  off_value   A tensor for off_value. Data types supported: F32*
-   * @param[out] output      A tensor for computed value of one hot operator
-   * @param[in]  axis        An int value for axis
-   */
-  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                 const ITensor *off_value, ITensor *output, const int axis);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CPPOneHotKernelEx
-   *
-   * @param[in]  indices     A tensor for indices. Data types supported: S32
-   * @param[in]  depth       A tensor for depth. Data types supported: S32
-   * @param[in]  on_value    A tensor for on_value. Data types supported: F32
-   * @param[in]  off_value   A tensor for off_value. Data types supported: F32*
-   * @param[in]  axis        An int value for axis
-   *
-   * @return a status
-   */
-  static Status validate(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                         const ITensor *off_value, const int axis);
-
-  // Inherited methods overridden:
-  void run(const Window &window, const ThreadInfo &info) override;
-  bool is_parallelisable() const override;
-
-private:
-  /** Template function to run the topKV operation. */
-  template <typename T> void run_one_hot();
-
-  const ITensor *_indices;
-  const ITensor *_depth;
-  const ITensor *_on_value;
-  const ITensor *_off_value;
-  ITensor *_output;
-  int _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CPPONEHOTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
index 28114f8b5..933d8760d 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
@@ -53,22 +53,6 @@ class QuantizationInfo;
 namespace arm_compute
 {
 
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
-                             const float32x4_t &scale);
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
-                     const float32x4_t &invscale);
-
-float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale);
-
-void elementwise_op_quantized(
-    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-    uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
-    int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
-                          float32x4_t, float32x4_t, float32x4_t, const bool),
-    int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
-                     int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t));
-
 void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
                     float (*scalar_func)(const float &, const float &),
                     int (*broadcast_func)(int, int, int, const float *, const float &, float *,
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
new file mode 100644
index 000000000..101f6ac8e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class for the kernel converting boolean type
+ */
+class NECastBoolKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NECastBoolKernel"; }
+  /** Default constructor*/
+  NECastBoolKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NECastBoolKernel(const NECastBoolKernel &) = delete;
+  /** Default move constructor */
+  NECastBoolKernel(NECastBoolKernel &&) = default;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NECastBoolKernel &operator=(const NECastBoolKernel &) = delete;
+  /** Default move assignment operator */
+  NECastBoolKernel &operator=(NECastBoolKernel &&) = default;
+  /** Set the input and output of the kernel
+   *
+   * Valid conversions Input -> Output :
+   *
+   *   - U8             -> U8, S8, U16, S16, U32, S32, F32, F16
+   *
+   * @param[in]  input  The input tensor to convert. Data types supported: U8
+   * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+   */
+  void configure(const ITensor *input, ITensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NECastBoolKernel
+   *
+   * @param[in] input  Source tensor info. Data types supported: U8
+   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  const ITensor *_input;
+  ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
index e765aa489..5acfde5a8 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -126,6 +126,7 @@ private:
   const ITensor *_input;
   const ITensor *_indices;
   int _axis;
+  size_t _indices_rank;
   ITensor *_output;
   kernel_ptr _func;
 };
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
new file mode 100644
index 000000000..99bb351bc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Kernel to perform other operation on NEON */
+class NEOneHotKernel : public INEKernel
+{
+public:
+  /** Default constructor. */
+  NEOneHotKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEOneHotKernel(const NEOneHotKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEOneHotKernel &operator=(const NEOneHotKernel &) = delete;
+  /** Allow instances of this class to be moved. */
+  NEOneHotKernel(NEOneHotKernel &&) = default;
+  /** Allow instances of this class to be moved. */
+  NEOneHotKernel &operator=(NEOneHotKernel &&) = default;
+  /** Default detructor */
+  ~NEOneHotKernel() = default;
+  /** Name of the kernel
+   *
+   * @return Kernel name
+   */
+  const char *name() const override { return "NEOneHotKernel"; }
+  /** Initialise the kernel's inputs and outputs
+   *
+ * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in]  depth     The tensor for depth of the one hot dimension. Supported tensor rank: up to
+ * 3. Must be one of the following types: U32/S32
+ * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same
+ * as @p on_value
+ * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+ * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+                 const ITensor *off_value, ITensor *output, int axis = -1);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+   *
+ * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in]  depth     The tensor info for depth of the one hot dimension. Supported tensor rank:
+ * up to 3. Must be one of the following types: U32/S32
+ * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+                         const ITensorInfo *on_value, const ITensorInfo *off_value,
+                         const ITensorInfo *output, int axis = -1);
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  /** Implementation of the onehot operation for 0 axis.
+   *
+   * For onehot on the 0 axis an element by element copy is performed.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   * returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info);
+  /** Implementation of the onehot operation.
+   *
+   * For 1<=axis a row-wise copy is taking place.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   * returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info);
+  using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info);
+  const ITensor *_indices;
+  const ITensor *_depth;
+  const ITensor *_on_value;
+  const ITensor *_off_value;
+  int _axis;
+  ITensor *_output;
+  kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
index a9ceacbea..1e69f0912 100644
--- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -238,6 +238,36 @@ inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
   return output_shape;
 }
 
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] actual_axis   The axis to be gathered
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth,
+                                           uint32_t actual_axis)
+{
+  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+  ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions());
+
+  TensorShape output_shape;
+  output_shape.set(actual_axis, depth);
+
+  unsigned int i_shift = 0;
+  for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i)
+  {
+    if (i == actual_axis)
+    {
+      i_shift++;
+    }
+    output_shape.set(i + i_shift, indices_shape[i]);
+  }
+
+  return output_shape;
+}
+
 } // namespace shape_calculator
 } // namespace misc
 } // namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index cfbd13436..484ebfd0b 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,14 +16,18 @@
 #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
 #define __ARM_COMPUTE_CLFUNCTIONSEX_H__
 
+#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+#include <arm_compute/runtime/CL/functions/CLCastBool.h>
 #include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
 #include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
 #include <arm_compute/runtime/CL/functions/CLGatherEx.h>
 #include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
 #include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLNeg.h>
+#include <arm_compute/runtime/CL/functions/CLOneHot.h>
 #include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
 #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
 #include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
 
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
new file mode 100644
index 000000000..b1ee52bf9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+class ICLTensor;
+
+/** Function to calculate the index of the minimum or maximum values in a
+ *  tensor based on an axis.
+ *
+ * @note The default data type for an uninitialized output tensor is
+ *       signed 32-bit integer (S32). It is the user's responsibility to check
+ *       that the results do not overflow because the indices are computed
+ *       in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerEx : public IFunction
+{
+public:
+  /** Default Constructor.
+   *
+   * @param[in] memory_manager (Optional) Memory manager.
+   */
+  CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input  Input source tensor. Data types supported: QASYMM8/F16/F32.
+   * @param[in]  axis   Axis to find max/min index.
+   * @param[out] output Output source tensor. Data types supported: U32/S32.
+   * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+   * ARG_IDX_MIN
+   */
+  void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLArgMinMaxLayerEx
+   *
+   * @param[in] input  Input source tensor info. Data types supported: QASYMM8/F16/F32.
+   * @param[in] axis   Axis to find max/min index.
+   * @param[in] output Output source tensor info. Data types supported: U32/S32.
+   * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+   * ARG_IDX_MIN
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+                         const ReductionOperation &op);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;
+  std::vector<CLTensor> _results_vector;
+  CLTensor _not_reshaped_output;
+  std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
+  CLReshapeLayerKernel _reshape_kernel;
+  unsigned int _num_of_stages;
+  unsigned int _reduction_axis;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
index 7930e4e20..d6150684a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -15,7 +15,7 @@
  */
 
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,31 +38,34 @@
  * SOFTWARE.
  */
 
-#ifndef __ARM_COMPUTE_CPPONEHOT_EX_H__
-#define __ARM_COMPUTE_CPPONEHOT_EX_H__
+/**
+ * @file CLCastBool.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCastBool class
+ */
+
+#ifndef ARM_COMPUTE_CLCASTBOOL_H
+#define ARM_COMPUTE_CLCASTBOOL_H
 
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
 {
-class ITensor;
+class ICLTensor;
 
-/** Basic function to run @ref CPPOneHot */
-class CPPOneHotEx : public ICPPSimpleFunction
+/**
+ * @brief Class to run @ref CLCastBoolKernel.
+ * This converts the boolean input tensor to the output tensor's type.
+ */
+class CLCastBool : public ICLSimpleFunction
 {
 public:
-  /** Configure the one_hot function
-   *
-   * @param[in]  indices     A tensor for indices. Data types supported: S32
-   * @param[in]  depth       A tensor for depth. Data types supported: S32
-   * @param[in]  on_value    A tensor for on_value. Data types supported: F32
-   * @param[in]  off_value   A tensor for off_value. Data types supported: F32
-   * @param[out] output      A tensor for computed value of one hot operator
-   * @param[in]  axis        An int value for axis
+  /**
+   * @brief Initialise the kernel's input and output
+   * @param[in]  input   Input tensor. Data types supported: U8
+   * @param[out] output  Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32.
    */
-  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                 const ITensor *off_value, ITensor *output, const int axis);
+  void configure(ICLTensor *input, ICLTensor *output);
 };
 }
-#endif /* __ARM_COMPUTE_CPPONEHOT_EX_H__ */
+#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
new file mode 100644
index 000000000..2bbfca821
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOT_H__
+#define __ARM_COMPUTE_CLONEHOT_H__
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+namespace arm_compute
+{
+class ICLTensor;
+/** Basic function to run @ref CLOneHotKernel */
+class CLOneHot : public IFunction
+{
+public:
+  /** Constructor */
+  CLOneHot();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLOneHot(const CLOneHot &) = delete;
+  /** Default move constructor */
+  CLOneHot(CLOneHot &&) = default;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLOneHot &operator=(const CLOneHot &) = delete;
+  /** Default move assignment operator */
+  CLOneHot &operator=(CLOneHot &&) = default;
+  /** Initialise the kernel's inputs and outputs
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+   * Same as @p on_value
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+                 ICLTensor *output, int depth, int axis = -1);
+  /** Initialise the kernel's inputs and outputs with off_value being constant
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  off_value The PixelValue for off value. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+                 PixelValue off_value, int depth, int axis = -1);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLOneHotKernel
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+   * Same as @p on_value
+   * @param[in]  output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  depth     The depth of the one hot dimension.
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * value must be in range [-indices.rank , indices.rank)
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                         const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                         int axis = -1);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  CLMemsetKernel _memset_kernel; /**< Memset kernel */
+  CLOneHotKernel _onehot_kernel; /**< OneHot kernel */
+  bool _has_to_memset;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLONEHOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
new file mode 100644
index 000000000..bb741d98d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSPLITVEX__
+#define __ARM_COMPUTE_CLSPLITVEX__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/core/Types.h"
+#include <vector>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSplitVKernel */
+class CLSplitVEx : public IFunction
+{
+public:
+  /** Default constructor */
+  CLSplitVEx();
+  /** Configure the split CL kernel
+   *
+   * @param[in]  input       The input tensor to split. Data types supported:
+   * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+   * @param[in]  size_splits A 1-D tensor containing the number of tensor values per split
+   * @param[out] outputs     A vector containing the output tensor. Data types supported: Same as @p
+   * input
+   *                         The output tensors should match the input tensor dimensions for all
+   * shape dimensions apart
+   *                         from the split dimension.
+   * @param[in]  split_dim   Integer value representing the input tensor dimension along which to
+   * split
+   * @param[in]  num_splits  Number of splits
+   */
+  void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+                 const std::vector<ICLTensor *> &outputs, unsigned int num_splits);
+
+  void run() override;
+
+private:
+  const ICLTensor *_input;
+  const ICLTensor *_size_splits;
+  std::vector<ICLTensor *> _outputs;
+  unsigned int _num_splits;
+  std::vector<CLSlice> _slice_functions;
+};
+}
+#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index 3fad230f1..d47b1fe62 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -18,11 +18,13 @@
 
 #include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
 #include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+#include <arm_compute/runtime/NEON/functions/NECastBool.h>
 #include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
 #include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
 #include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEOneHot.h>
 #include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
 #include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
 #include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
new file mode 100644
index 000000000..c8b08af8d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTBOOL_H__
+#define __ARM_COMPUTE_NECASTBOOL_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to run @ref NECastBoolKernel.
+ */
+class NECastBool : public INESimpleFunction
+{
+public:
+  /** Initialize the function's source, destination
+   *
+   * Valid conversions Input -> Output :
+   *
+   *   - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
+   *
+   * @param[in]  input  The input tensor to convert. Data types supported: U8
+   * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+   */
+  void configure(const ITensor *input, ITensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref NECastBool
+   *
+   * @param[in] input  Source tensor info. Data types supported: U8.
+   * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
new file mode 100644
index 000000000..b2ea6270f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOT_H__
+#define __ARM_COMPUTE_NEONEHOT_H__
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Basic function to run @ref NEOneHotKernel */
+class NEOneHot : public INESimpleFunctionNoBorder
+{
+public:
+  /** Initialise the kernel's inputs and outputs
+   *
+   * @param[in]  indices   Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following types: U32/S32
+   * @param[in]  depth     The tensor for depth of the one hot dimension. Supported tensor rank: up
+   * to 3. Must be one of the following types: U32/S32
+   * @param[in]  on_value  On value tensor. Supported tensor rank: only 1. Data type supported:
+   * U8/S8/U16/S16/F16/U32/S32/F32
+   * @param[in]  off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+   * Same as @p on_value
+   * @param[out] output    Destination tensor. Data type supported: Same as @p on_value
+   * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+   * The value must be in range [-indices.rank , indices.rank)
+   */
+  void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+                 const ITensor *off_value, ITensor *output, int axis = -1);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+   *
+ * @param[in]  indices   Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in]  depth     The tensor info for depth of the one hot dimension. Supported tensor rank:
+ * up to 3. Must be one of the following types: U32/S32
+ * @param[in]  on_value  On value tensor info. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output    Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in]  axis      (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+                         const ITensorInfo *on_value, const ITensorInfo *off_value,
+                         const ITensorInfo *output, int axis = -1);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOT_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index ba42a2456..81d0cb70f 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -55,7 +55,12 @@ using namespace arm_compute;
 
 const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
     // ARMComputeEx kernels
+    {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
+    {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
+    {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
+    {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
     {"binary_logical_op", "binary_logical_op.cl"},
+    {"cast_bool", "cast.cl"},
     {"embedding_lookup", "embedding_lookup.cl"},
     {"gather_ex", "gather_ex.cl"},
     {"gather_ex_1d", "gather_ex.cl"},
@@ -65,6 +70,8 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"instance_normalization_ex", "instance_normalization_ex.cl"},
     {"multiply_scale_factor", "multiply_scale_factor.cl"},
     {"neg_tensor", "neg_tensor.cl"},
+    {"one_hot", "one_hot.cl"},
+    {"one_hot_only_on_value", "one_hot.cl"},
     {"quantization_symm8", "quantization_symm8.cl"},
     {"reduce_min_max", "reduce_operation.cl"},
     {"reduce_sum_mean", "reduce_operation.cl"},
@@ -83,6 +90,14 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
 const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
 #ifdef EMBEDDED_KERNELS
     {
+        "arg_min_max_ex.cl",
+#include "./cl_kernels/arg_min_max_ex.clembed"
+    },
+    {
+        "cast.cl",
+#include "./cl_kernels/cast.clembed"
+    },
+    {
         "embedding_lookup.cl",
 #include "./cl_kernels/embedding_lookup.clembed"
     },
@@ -123,6 +138,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/neg_tensor.clembed"
     },
     {
+        "one_hot.cl",
+#include "./cl_kernels/one_hot.clembed"
+    },
+    {
         "quantization_symm8.cl",
 #include "./cl_kernels/quantization_symm8.clembed"
     },
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
new file mode 100644
index 000000000..0a014d15c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) \
+  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
+#define ISLESS(x, y) \
+  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
+#if defined(WIDTH)
+#if defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+/** Find index minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
+                                             __global const DATA_TYPE_OUTPUT *prev_res,
+                                             const int x_idx)
+{
+  int end_elem = (x_idx + 1) * 16;
+  if (end_elem > WIDTH)
+  {
+    end_elem = WIDTH - x_idx * 16;
+  }
+  DATA_TYPE_OUTPUT res = prev_res[0];
+  for (int x_v = 1; x_v < end_elem; ++x_v)
+  {
+    res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
+  }
+  return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find index minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+  DATA_TYPE_OUTPUT res = 0;
+  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+  {
+    res = select(res, x_v, *(input + x_v) < *(input + res));
+  }
+  return res;
+#else  // WIDTH >= 16
+  int x_elem = x_idx * 16;
+  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+  x_elem -= x_goback;
+
+  VEC_DATA_TYPE(DATA_TYPE, 16)
+  in = vload16(0, input - x_goback);
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+  idx_sel = (in.s01234567 <= in.s89abcdef);
+  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+  idx_sel.s0123 = (in.s0123 < in.s4567) ||
+                  (in.s0123 == in.s4567 &&
+                   CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+  idx_sel.s01 =
+      (in.s01 < in.s23) ||
+      (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+  in.s01 = select(in.s23, in.s01, idx_sel.s01);
+  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+  idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+  return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MIN)
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+/** Find index maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
+                                             __global const DATA_TYPE_OUTPUT *prev_res,
+                                             const int x_idx)
+{
+  int end_elem = (x_idx + 1) * 16;
+  if (end_elem > WIDTH)
+  {
+    end_elem = WIDTH - x_idx * 16;
+  }
+  DATA_TYPE_OUTPUT res = prev_res[0];
+  unsigned int res_int = res;
+  DATA_TYPE_OUTPUT condition_check2;
+  for (int x_v = 1; x_v < end_elem; ++x_v)
+  {
+    int i1 = prev_res[x_v];
+    condition_check2 = *(input + i1) > *(input + res_int);
+    res = select(res, prev_res[x_v], condition_check2);
+  }
+  return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find index maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+  DATA_TYPE_OUTPUT res = 0;
+  unsigned int i1;
+  unsigned int i2;
+  DATA_TYPE_OUTPUT condition_check;
+  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+  {
+    i1 = x_v;
+    i2 = res;
+    condition_check = *(input + i1) > *(input + i2);
+    res = select(res, x_v, condition_check);
+  }
+  return res;
+#else  // WIDTH >= 16
+  int x_elem = x_idx * 16;
+  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+  x_elem -= x_goback;
+
+  VEC_DATA_TYPE(DATA_TYPE, 16)
+  in = vload16(0, input - x_goback);
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+  idx_sel = (in.s01234567 >= in.s89abcdef);
+  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+  idx_sel.s0123 = (in.s0123 > in.s4567) ||
+                  (in.s0123 == in.s4567 &&
+                   CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+  idx_sel.s01 =
+      (in.s01 > in.s23) ||
+      (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+  in.s01 = select(in.s23, in.s01, idx_sel.s01);
+  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+  idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+  return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX)
+
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
+ * using -DPREV_OUTPUT
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
+ * ArgMax
+ * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
+ * ArgMin
+ *
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x                              Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] src_step_x                                src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y                              Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] src_step_y                                src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the
+ * source tensor
+ * @param[in] prev_res_ptr                              (Optional) Pointer to previous results
+ * tensor. Supported data types: U32/S32
+ * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X
+ * dimension (in bytes)
+ * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y
+ * dimension (in bytes)
+ * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element
+ * in the previous results tensor
+ * @param[in] partial_res_ptr                           The local buffer to hold partial result
+ * values. Supported data types: U32/S32
+ * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension
+ * (in bytes)
+ * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension
+ * (in bytes)
+ * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[in] local_results                             Local buffer for storing the partial result
+ */
+__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
+#if defined(PREV_OUTPUT)
+                               IMAGE_DECLARATION(prev_res),
+#endif // defined(PREV_OUTPUT)
+                               IMAGE_DECLARATION(partial_res),
+                               __local DATA_TYPE_OUTPUT *local_results)
+{
+#if defined(PREV_OUTPUT)
+  Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+  Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
+#else  // !defined(PREV_OUTPUT)
+  Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#endif // defined(PREV_OUTPUT)
+  Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+
+  unsigned int lsize = get_local_size(0);
+  unsigned int lid = get_local_id(0);
+
+  const uint x_idx = get_global_id(0);
+  const uint y_idx = get_global_id(1);
+  const __global DATA_TYPE *src_in_row =
+      (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes +
+                                   y_idx * src_step_y);
+
+  for (unsigned int y = 0; y < get_local_size(1); ++y)
+  {
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+    local_results[lid] = arg_idx_max_prev_out(
+        src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else  // !defined(PREV_OUTPUT)
+    local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#else  // defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+    local_results[lid] = arg_idx_min_prev_out(
+        src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else  // !defined(PREV_OUTPUT)
+    local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Looking for the next highest power of 2 (maximum value of lsize is 8)
+    unsigned int middle = lsize - 1;
+    middle |= middle >> 1;
+    middle |= middle >> 2;
+    middle += 1;
+    // Perform parallel reduction
+    DATA_TYPE_OUTPUT condition_check3;
+    for (unsigned int i = middle; i > 0; i >>= 1)
+    {
+      if (lid < i && lid + i < lsize)
+      {
+        DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
+        DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
+#if defined(ARG_MAX)
+        condition_check3 =
+            ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
+        local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
+#else  // defined(ARG_MIN)
+        local_results[lid] = select(
+            local_results[lid], local_results[lid + i],
+            ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lid == 0)
+    {
+      ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+    }
+  }
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
+ * -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y                           src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
+{
+  Image src = CONVERT_TO_IMAGE_STRUCT(src);
+  Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16)
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+  indx = 0;
+  for (unsigned int y = 1; y < HEIGHT; ++y)
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in =
+        CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, y, cond_conv);
+    res = select(res, in, CONDITION_TO_USE(in, res));
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16)
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
+                VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+  indx = 0;
+  for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
+                 VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, z, cond_conv);
+    res = select(res, in, CONDITION_TO_USE(in, res));
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w                         input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w                        output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16)
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
+                VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+  indx = 0;
+  for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
+                 VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, w, cond_conv);
+    res = select(res, in, CONDITION_TO_USE(in, res));
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..3b0a175a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs a up-scaling depth conversion for boolean type input.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The integer shift amount value need to be passed at compile time using -DSHIFT:
+ * e.g. -DSHIFT=7
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types:
+ * U8
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
+{
+  // Get pixels pointer
+  Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+  // Load data
+  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+  VSTORE(VEC_SIZE)
+  (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
new file mode 100644
index 000000000..c274aba62
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
+
+/** Performs the OneHot operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in]  indices_ptr                              Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in]  indices_stride_x                         Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in]  indices_step_x                           indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in]  indices_stride_y                         Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in]  indices_step_y                           indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                         Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in]  indices_step_z                           indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes    Offset of the first element in the source
+ * tensor
+ * @param[in]  on_value_ptr                             Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in]  on_value_stride_x                        Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in]  on_value_step_x                          on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in]  on_value_offset_first_element_in_bytes   Offset of the first element in the on_value
+ * vector
+ * @param[in]  off_value_ptr                            Pointer to the off_value vector. Supported
+ * data types: Same as @p on_value.
+ * @param[in]  off_value_stride_x                       Stride of the off_value vector in X
+ * dimension (in bytes)
+ * @param[in]  off_value_step_x                         off_value_stride_x * number of elements
+ * along X processed per work item (in bytes)
+ * @param[in]  off_value_offset_first_element_in_bytes  Offset of the first element in the off_value
+ * vector
+ * @param[out] output_ptr                               Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in]  output_stride_x                          Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in]  output_step_x                            output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_y                          Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in]  output_step_y                            output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_z                          Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in]  output_step_z                            output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_w                          Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in]  output_step_w                            output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes     Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+                      VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output))
+{
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+  const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw);
+  *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                  : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw);
+  *(__global DATA_TYPE *)output.ptr = index == py ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                  : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw);
+  *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                  : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+  *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                  : *((__global const DATA_TYPE *)off_value_ptr);
+#endif // AXIS
+}
+
+/** Performs the OneHot operation along the chosen axis as off_value being zero
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in]  indices_ptr                              Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in]  indices_stride_x                         Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in]  indices_step_x                           indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in]  indices_stride_y                         Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in]  indices_step_y                           indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                         Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in]  indices_step_z                           indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes    Offset of the first element in the source
+ * tensor
+ * @param[in]  on_value_ptr                             Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in]  on_value_stride_x                        Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in]  on_value_step_x                          on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in]  on_value_offset_first_element_in_bytes   Offset of the first element in the on_value
+ * vector
+ * @param[out] output_ptr                               Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in]  output_stride_x                          Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in]  output_step_x                            output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_y                          Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in]  output_step_y                            output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_z                          Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in]  output_step_z                            output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in]  output_stride_w                          Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in]  output_step_w                            output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes     Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+                                    TENSOR4D_DECLARATION(output))
+{
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2);
+
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z);
+
+  const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz);
+
+  if (index < 0 || index >= DEPTH)
+    return;
+
+#if AXIS == 0
+  *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) =
+      *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 1
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) =
+      *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 2
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) =
+      *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 3
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) =
+      *((__global const DATA_TYPE *)on_value_ptr);
+#endif // AXIS
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
new file mode 100644
index 000000000..047004d5e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int vector_size = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output,
+                          const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+                                      op != ReductionOperation::ARG_IDX_MIN,
+                                  "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                  "Reduction axis greater than max number of dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32,
+                                                         DataType::S64);
+  }
+  if (prev_output != nullptr && prev_output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32,
+                                                         DataType::S32, DataType::S64);
+    if (output->total_size() != 0)
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
+    }
+  }
+
+  return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
+                                                         ITensorInfo *prev_output,
+                                                         ITensorInfo *output, unsigned int axis,
+                                                         ReductionOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+  // Output tensor auto initialization if not yet initialized
+  TensorShape output_shape{input->tensor_shape()};
+  output_shape.set(axis, 1);
+  DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
+  auto_init_if_empty(*output, input->clone()
+                                  ->set_tensor_shape(output_shape)
+                                  .set_data_type(output_data_type)
+                                  .reset_padding()
+                                  .set_is_resizable(true));
+
+  Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input),
+                                    Steps(vector_size));
+  bool window_changed = false;
+
+  switch (axis)
+  {
+    case 0:
+    {
+      ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
+      AccessWindowStatic input_access(input_tensor_access, 0, 0,
+                                      static_cast<int>(input_tensor_access->dimension(0)), 1);
+      AccessWindowHorizontal output_access(output, 0, 1);
+      window_changed = update_window_and_padding(win, input_access, output_access);
+      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    break;
+    case 1:
+    case 2:
+    case 3:
+    {
+      AccessWindowHorizontal input_access(input, 0, vector_size);
+      AccessWindowHorizontal output_access(output, 0, vector_size);
+      window_changed = update_window_and_padding(win, input_access, output_access);
+      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    break;
+    default:
+      ARM_COMPUTE_ERROR("Not supported");
+  }
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_tuple(err, win);
+}
+} // namespace
+
+CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
+    : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
+      _op(ReductionOperation::ARG_IDX_MAX)
+{
+}
+
+void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output,
+                                         ICLTensor *output, unsigned int axis,
+                                         ReductionOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
+                         output->info(), axis, op));
+  auto win_config = validate_and_configure_window(
+      input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
+      op);
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+  _input = input;
+  _prev_output = prev_output;
+  _output = output;
+  _reduction_axis = axis;
+  _op = op;
+
+  // Set build options
+  CLBuildOptions build_opts;
+
+  build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
+  build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
+  build_opts.add_option("-DDATA_TYPE_OUTPUT=" +
+                        get_cl_type_from_data_type(output->info()->data_type()));
+  build_opts.add_option("-DDATA_TYPE_SELECT=" +
+                        get_cl_signed_type_from_element_size(input->info()->element_size()));
+
+  // Create kernel
+  cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+  std::string kernel_axis_name;
+  switch (axis)
+  {
+    case 0:
+    {
+      const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
+      build_opts.add_option("-DWIDTH=" +
+                            support::cpp11::to_string(input_for_width->info()->dimension(0)));
+
+      kernel_axis_name = "x";
+      lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0),
+                                                          vector_size);
+    }
+    break;
+    case 1:
+      build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+      kernel_axis_name = "y";
+      break;
+    case 2:
+      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+      kernel_axis_name = "z";
+      break;
+    case 3:
+      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+      build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+      kernel_axis_name = "w";
+      break;
+    default:
+      ARM_COMPUTE_ERROR("Not supported");
+  }
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+      "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
+
+  // Configure kernel window
+  ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+}
+
+Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+                                          const ITensorInfo *output, unsigned int axis,
+                                          ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+      input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
+      output->clone().get(), axis, op)));
+  return Status{};
+}
+
+void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  switch (_reduction_axis)
+  {
+    case 0:
+    {
+      // Set out window
+      Window out_window(window);
+      out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+      // Get first input and output slices
+      Window in_slice = window.first_slice_window_2D();
+      Window out_slice = out_window.first_slice_window_2D();
+
+      // Reshape window
+      const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
+
+      // Set local sums buffer
+      unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
+      _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
+      do
+      {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, in_slice);
+        if (_prev_output != nullptr)
+        {
+          add_2D_tensor_argument(idx, _prev_output, in_slice);
+        }
+        add_2D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice, lws_hint());
+      } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+    }
+    break;
+    case 1:
+    {
+      // Get first input and output slices
+      Window window_in{window};
+      window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1),
+                                                    _input->info()->dimension(1)));
+      Window in_slice = window_in.first_slice_window_2D();
+      Window out_slice = window.first_slice_window_2D();
+
+      do
+      {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice, lws_hint());
+      } while (window_in.slide_window_slice_2D(in_slice) &&
+               window.slide_window_slice_2D(out_slice));
+    }
+    break;
+    case 2:
+    {
+      // Get first input and output slices
+      Window window_in{window};
+      window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2),
+                                                    _input->info()->dimension(2)));
+      Window in_slice = window_in.first_slice_window_3D();
+      Window out_slice = window.first_slice_window_3D();
+
+      do
+      {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice, lws_hint());
+      } while (window_in.slide_window_slice_3D(in_slice) &&
+               window.slide_window_slice_3D(out_slice));
+    }
+    break;
+    case 3:
+    {
+      // Get first input and output slices
+      Window window_in{window};
+      window_in.set(3, Window::Dimension(0, 1, 1));
+      Window in_slice = window_in.first_slice_window_4D();
+      Window out_slice = window.first_slice_window_4D();
+
+      do
+      {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, in_slice);
+        add_4D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice, lws_hint());
+      } while (window_in.slide_window_slice_4D(in_slice) &&
+               window.slide_window_slice_4D(out_slice));
+    }
+    break;
+    default:
+      ARM_COMPUTE_ERROR("Not supported");
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
new file mode 100644
index 000000000..6e0bcde7f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "support/StringSupport.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+                                                       DataType::S16, DataType::U16, DataType::U32,
+                                                       DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(),
+                                  "Input and output data types must be different");
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  }
+
+  return Status{};
+}
+} // namespace
+
+void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
+  // must be given)
+  set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  // Get number of elements to process per iterations
+  constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DVEC_SIZE=" +
+                        support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option("-DDATA_TYPE_OUT=" +
+                        get_cl_type_from_data_type(output->info()->data_type()));
+
+  // Create kernel
+  const std::string kernel_name = "cast_bool";
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+
+  // Configure kernel
+  ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+  // Collapse window
+  const Window &full_window = window();
+  Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+  ICLKernel::configure_internal(collapsed_window);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = kernel_name;
+  _config_id += "_";
+  _config_id += lower_string(string_from_data_type(output->info()->data_type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(output->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+  return Status{};
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
new file mode 100644
index 000000000..35d70d689
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "support/StringSupport.h"
+#include <string>
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                 const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions());
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+                                                       DataType::U16, DataType::S16, DataType::F16,
+                                                       DataType::U32, DataType::S32, DataType::F32);
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+        indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+  }
+  return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
+                                                        const ITensorInfo *on_value,
+                                                        ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output, indices);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  // Output auto initialization if not yet initialized
+  TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+  auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
+  // Create window
+  Window win = calculate_max_window(*output, Steps());
+  output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+CLOneHotKernel::CLOneHotKernel()
+    : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+      _is_off_value_memset(false)
+{
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = false;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
+  ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
+  ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  _off_value = off_value;
+  configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = true;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+  configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
+                                      ICLTensor *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+  // Configure kernel window
+  auto win_config =
+      validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  if (_is_off_value_memset)
+  {
+    // Replace window with calculated by infices info
+    win_config.second = calculate_max_window(*indices->info(), Steps());
+  }
+  _indices = indices;
+  _on_value = on_value;
+  _output = output;
+  const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
+                                             data_size_from_type(on_value->info()->data_type())));
+  build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
+  build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+  build_opts.add_option("-DOUTPUT_DIM_Z=" +
+                        support::cpp11::to_string(output->info()->dimension(2)));
+  // Create kernel
+  const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+  ICLKernel::configure_internal(win_config.second);
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                                int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
+  ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                  .first);
+  return Status{};
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                  .first);
+  return Status{};
+}
+void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  unsigned int idx = 0;
+  add_3D_tensor_argument(idx, _indices, window_collapsed);
+  add_1D_tensor_argument(idx, _on_value, window_collapsed);
+  if (!_is_off_value_memset)
+  {
+    add_1D_tensor_argument(idx, _off_value, window_collapsed);
+  }
+  add_4D_tensor_argument(idx, _output, window_collapsed);
+  enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
deleted file mode 100644
index 480532388..000000000
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-namespace arm_compute
-{
-CPPOneHotKernelEx::CPPOneHotKernelEx()
-    : _indices(nullptr), _depth(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
-      _axis(-1)
-{
-}
-
-void CPPOneHotKernelEx::configure(const ITensor *indices, const ITensor *depth,
-                                  const ITensor *on_value, const ITensor *off_value,
-                                  ITensor *output, const int axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(indices, depth, on_value, off_value, axis));
-
-  _indices = indices;
-  _depth = depth;
-  _on_value = on_value;
-  _off_value = off_value;
-  _output = output;
-  _axis = axis;
-
-  ICPPKernel::configure(Window()); // Default 1 iteration window
-}
-
-Status CPPOneHotKernelEx::validate(const ITensor *indices, const ITensor *depth,
-                                   const ITensor *on_value, const ITensor *off_value,
-                                   const int axis)
-{
-  ARM_COMPUTE_UNUSED(on_value, off_value);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(depth, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->info()->num_dimensions() != 1,
-                                  "Only 1D indices are supported.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != -1, "Only axis = -1 is supported.");
-  return Status{};
-}
-
-bool CPPOneHotKernelEx::is_parallelisable() const { return false; }
-
-void CPPOneHotKernelEx::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
-
-  const auto num_indices = _indices->info()->dimension(0);
-  const auto depth = *reinterpret_cast<int32_t *>(_depth->ptr_to_element(Coordinates{0}));
-  const auto dtype = _output->info()->data_type();
-  switch (dtype)
-  {
-    case DataType::F32:
-    {
-      const auto on_value = *reinterpret_cast<float *>(_on_value->ptr_to_element(Coordinates{0}));
-      const auto off_value = *reinterpret_cast<float *>(_off_value->ptr_to_element(Coordinates{0}));
-      for (size_t i = 0; i < num_indices; ++i)
-      {
-        const auto index = *reinterpret_cast<int32_t *>(_indices->ptr_to_element(Coordinates{i}));
-        for (int d = 0; d < depth; ++d)
-          *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(d, i))) =
-              (d == index) ? on_value : off_value;
-      }
-      break;
-    }
-    default:
-      ARM_COMPUTE_ERROR("Unsupported data type.");
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
index 254c33ea9..dfe5d59b0 100644
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -49,12 +49,6 @@
 
 namespace
 {
-void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out)
-{
-  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
-  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
-  vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
 
 using namespace arm_compute;
 template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
@@ -151,195 +145,6 @@ void elementwise_op_templ(
 namespace arm_compute
 {
 
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
-                             const float32x4_t &scale)
-{
-  qasymm8x16_t x = vld1q_u8(input1_ptr);
-  const float32x4x4_t out = {{
-      vmulq_f32(
-          vcvtq_f32_s32(vsubq_s32(
-              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
-          scale),
-      vmulq_f32(
-          vcvtq_f32_s32(vsubq_s32(
-              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
-          scale),
-      vmulq_f32(
-          vcvtq_f32_s32(vsubq_s32(
-              vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
-          scale),
-      vmulq_f32(
-          vcvtq_f32_s32(vsubq_s32(
-              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
-          scale),
-  }};
-  return out;
-}
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
-                     const float32x4_t &invscale)
-{
-  int32x4x4_t out = {{
-      vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
-      vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
-      vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
-      vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
-  }};
-  store_quantized_int32(output_ptr, out);
-}
-
-float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale)
-{
-  const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
-  const int32x4_t voffset = vdupq_n_s32(offset);
-  const float32x4_t vscale = vdupq_n_f32(scale);
-
-  const float32x4x4_t broadcast_vector = {{
-      vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
-                                            vmovl_u8(vget_low_u8(broadcast_value_vec))))),
-                                        voffset)),
-                vscale),
-      vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
-                                            vmovl_u8(vget_low_u8(broadcast_value_vec))))),
-                                        voffset)),
-                vscale),
-      vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
-                                            vmovl_u8(vget_high_u8(broadcast_value_vec))))),
-                                        voffset)),
-                vscale),
-      vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
-                                            vmovl_u8(vget_high_u8(broadcast_value_vec))))),
-                                        voffset)),
-                vscale),
-  }};
-  return broadcast_vector;
-}
-
-void elementwise_op_quantized(
-    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-    uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
-    int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
-                          float32x4_t, float32x4_t, float32x4_t, const bool),
-    int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
-                     int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
-{
-  // Create input windows
-  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
-  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-  // Clear X Dimension on execution window as we handle manually
-  Window win = window;
-  win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-  const int window_step_x = 16;
-  const auto window_start_x = static_cast<int>(window.x().start());
-  const auto window_end_x = static_cast<int>(window.x().end());
-  const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
-
-  UniformQuantizationInfo qinfo = out->info()->quantization_info().uniform();
-  const float output_scale = qinfo.scale;
-  const int output_offset = qinfo.offset;
-
-  // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from
-  // zero)
-  const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f);
-  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
-
-  if (is_broadcast_across_x)
-  {
-    // Select the broadcast input on the X axis
-    const bool is_broadcast_input_2 = input2_win.x().step() == 0;
-    Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
-    Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
-    const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
-    const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
-    const UniformQuantizationInfo broadcast_qinfo =
-        broadcast_tensor->info()->quantization_info().uniform();
-    const UniformQuantizationInfo non_broadcast_qinfo =
-        non_broadcast_tensor->info()->quantization_info().uniform();
-
-    const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
-    const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
-    // Clear X Dimension on execution window as we handle manually
-    non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator broadcast_input(broadcast_tensor, broadcast_win);
-    Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-    Iterator output(out, win);
-
-    execute_window_loop(
-        win,
-        [&](const Coordinates &) {
-          const auto non_broadcast_input_ptr =
-              reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
-          const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-          const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
-          const float32x4x4_t broadcast_vector =
-              dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale);
-
-          int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
-                                    non_broadcast_input_ptr, broadcast_vector, output_ptr,
-                                    voffset_non_broadcast, vscale_non_broadcast, voffseto,
-                                    invvscaleo, !is_broadcast_input_2);
-          for (; x < window_end_x; ++x)
-          {
-            const float afs =
-                dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-            const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
-            *(output_ptr + x) =
-                (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
-                               out->info()->quantization_info());
-          }
-        },
-        broadcast_input, non_broadcast_input, output);
-  }
-  else
-  {
-    // Input1 quantization info
-    UniformQuantizationInfo qinfo = in1->info()->quantization_info().uniform();
-    const int32x4_t voffset1 = vdupq_n_s32(qinfo.offset);
-    const float32x4_t vscale1 = vdupq_n_f32(qinfo.scale);
-
-    // Input2 quantization info
-    qinfo = in2->info()->quantization_info().uniform();
-    const int32x4_t voffset2 = vdupq_n_s32(qinfo.offset);
-    const float32x4_t vscale2 = vdupq_n_f32(qinfo.scale);
-
-    // Clear X Dimension on execution window as we handle manually
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
-    const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
-
-    Iterator input1(in1, input1_win);
-    Iterator input2(in2, input2_win);
-    Iterator output(out, win);
-
-    execute_window_loop(win,
-                        [&](const Coordinates &) {
-                          const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-                          const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-                          const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-                          int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
-                                               input1_ptr, input2_ptr, output_ptr, voffset1,
-                                               voffset2, vscale1, vscale2, voffseto, invvscaleo);
-                          for (; x < window_end_x; ++x)
-                          {
-                            const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
-                            const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
-                            *(output_ptr + x) =
-                                (*scalar_func)(afs, bfs, out->info()->quantization_info());
-                          }
-                        },
-                        input1, input2, output);
-  }
-}
-
 void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
                     float (*scalar_func)(const float &, const float &),
                     int (*broadcast_func)(int, int, int, const float *, const float &, float *,
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
new file mode 100644
index 000000000..12017e543
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/SaturateCast.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+                                                       DataType::S16, DataType::U16, DataType::F16,
+                                                       DataType::U32, DataType::S32, DataType::F32);
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  }
+
+  return Status{};
+}
+} // namespace
+
+NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {}
+
+void NECastBoolKernel::configure(const ITensor *input, ITensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
+  // must be given)
+  set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+  _input = input;
+  _output = output;
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps());
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICPPKernel::configure(win);
+}
+
+Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+  return Status{};
+}
+
+void NECastBoolKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+  ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
+  ARM_COMPUTE_ERROR_ON(_input == _output);
+
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+  const int window_step_x = 16;
+
+  Window win{window};
+  win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  Iterator input(_input, win);
+  Iterator output(_output, win);
+
+  const uint8_t true_val = 1;
+  const uint8x8_t mask_bool = vdup_n_u8(true_val);
+
+  switch (_output->info()->data_type())
+  {
+    case DataType::S8:
+    {
+      /* Conversion U8 -> S8 */
+      execute_window_loop(win,
+                          [&](const Coordinates &) {
+                            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                            {
+                              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+                              vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8(
+                                                           texels_u8, vdupq_n_u8(true_val))));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                              *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val);
+                            }
+                          },
+                          input, output);
+      break;
+    }
+    case DataType::S16:
+    {
+      /* Up-conversion U8 -> S16 */
+      execute_window_loop(
+          win,
+          [&](const Coordinates &) {
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+              const int16x8x2_t texels = {
+                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+              vst1q_s16(output_ptr + x, texels.val[0]);
+              vst1q_s16(output_ptr + x + 8, texels.val[1]);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+              *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val);
+            }
+          },
+          input, output);
+      break;
+    }
+    case DataType::S32:
+    {
+      /* Up-conversion U8 -> S32 */
+      execute_window_loop(
+          win,
+          [&](const Coordinates &) {
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+              const int16x8x2_t texels = {
+                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+              vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+              vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+              vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+              vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+              *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val);
+            }
+          },
+          input, output);
+      break;
+    }
+    case DataType::F32:
+    {
+      /* Up-conversion U8 -> F32 */
+      execute_window_loop(
+          win,
+          [&](const Coordinates &) {
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+              const int16x8x2_t texels = {
+                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+              vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+              vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+              vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+              vst1q_f32(output_ptr + x + 12,
+                        vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+              auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val);
+              *(output_ptr + x) = static_cast<float>(in);
+            }
+          },
+          input, output);
+      break;
+    }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::F16:
+    {
+      /* Up-conversion U8 -> F16 */
+      execute_window_loop(
+          win,
+          [&](const Coordinates &) {
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+              const int16x8x2_t texels = {
+                  {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+                   vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+              vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
+              vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+              *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val);
+            }
+          },
+          input, output);
+      break;
+    }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::U8:
+    {
+      /* Conversion U8 -> S8 */
+      execute_window_loop(win,
+                          [&](const Coordinates &) {
+                            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                            {
+                              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+                              vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val)));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                              *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val);
+                            }
+                          },
+                          input, output);
+      break;
+    }
+    case DataType::U16:
+    {
+      /* Up-conversion U8 -> U16 */
+      execute_window_loop(
+          win,
+          [&](const Coordinates &) {
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+              const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+              const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)),
+                                            vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}};
+
+              vst1q_u16(output_ptr + x, texels.val[0]);
+              vst1q_u16(output_ptr + x + 8, texels.val[1]);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+              *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val);
+            }
+          },
+          input, output);
+      break;
+    }
+    default:
+      ARM_COMPUTE_ERROR("Output data type not supported");
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
index 4c0a5e799..93963a504 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -70,7 +70,10 @@ template <typename U> void validate_indices(const ITensor *indices)
 
 } // namespace
 
-NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {}
+NEGatherKernelEx::NEGatherKernelEx()
+    : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{}
+{
+}
 
 template <typename U>
 inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
@@ -85,10 +88,10 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn
       window,
       [&](const Coordinates &id) {
         Coordinates gather_id(id);
-        gather_id.collapse(_indices->info()->num_dimensions(), 0);
+        gather_id.collapse(_indices_rank);
 
         U new_index;
-        switch (_indices->info()->num_dimensions())
+        switch (_indices_rank)
         {
           case 1:
             new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
@@ -130,10 +133,10 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf
       output_window,
       [&](const Coordinates &id) {
         Coordinates gather_id(id);
-        gather_id.collapse(_indices->info()->num_dimensions(), _axis);
+        gather_id.collapse(_indices_rank, _axis);
 
         U new_index;
-        switch (_indices->info()->num_dimensions())
+        switch (_indices_rank)
         {
           case 1:
             new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
@@ -174,6 +177,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I
   _indices = indices;
   _output = output;
   _axis = axis;
+  _indices_rank = indices->info()->num_dimensions();
 
   if (_axis < 0)
   {
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
new file mode 100644
index 000000000..0a11eb509
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+namespace arm_compute
+{
+namespace
+{
+/** Validate the depth
+ *
+ * Validate that depth are not negative
+ *
+ * @param[in] depth Depth tensor.
+ * @param[in] output Output tensor.
+ * @param[in] axis Axis of depth.
+ */
+template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis)
+{
+  ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0);
+  ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) !=
+                       *(reinterpret_cast<U *>(depth->buffer())));
+}
+
+Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth,
+                          const ITensorInfo *on_value, const ITensorInfo *off_value,
+                          const ITensorInfo *output, int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+  const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis ||
+                              actual_axis >= static_cast<int>(output->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+                                                       DataType::U16, DataType::S16, DataType::F16,
+                                                       DataType::U32, DataType::S32, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+  }
+
+  return Status{};
+}
+
+template <typename U, typename Enable = void> bool isOnValue(U) { return true; }
+
+template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0>
+bool isOnValue(U index, U depth)
+{
+  return index >= 0 && index < depth;
+}
+} // namespace
+
+NEOneHotKernel::NEOneHotKernel()
+    : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1},
+      _output{nullptr}, _func{}
+{
+}
+
+template <typename U>
+void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  // Validate that the depth are not negative
+  validate_depth<U>(_depth, _output, _axis);
+  Window output_window{window};
+  output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+  Iterator output_it(_output, output_window);
+  const U off_value = *reinterpret_cast<U *>(_off_value->buffer());
+  execute_window_loop(
+      output_window,
+      [&](const Coordinates &id) {
+        std::fill_n(output_it.ptr(),
+                    _output->info()->dimension(0) * _output->info()->element_size(), off_value);
+        Coordinates indices_id(id);
+        indices_id.remove(0);
+        const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+        if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+        {
+          Coordinates onehot_id(id);
+          onehot_id.set(0, new_index);
+          std::copy_n(_on_value->buffer(), _output->info()->element_size(),
+                      _output->ptr_to_element(onehot_id));
+        }
+      },
+      output_it);
+}
+
+template <typename U>
+inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  // Validate that the indices are not negative
+  validate_depth<U>(_depth, _output, _axis);
+  Iterator output_it(_output, window);
+  execute_window_loop(window,
+                      [&](const Coordinates &id) {
+                        Coordinates indices_id(id);
+                        indices_id.remove(_axis);
+                        const U new_index =
+                            *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+                        if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+                        {
+                          Coordinates onehot_id(id);
+                          onehot_id.set(_axis, new_index);
+                          std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer()
+                                                                             : _off_value->buffer(),
+                                      _output->info()->element_size(), output_it.ptr());
+                        }
+                      },
+                      output_it);
+}
+
+void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth,
+                               const ITensor *on_value, const ITensor *off_value, ITensor *output,
+                               int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+  ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(),
+                                                off_value->info(), output->info(), axis));
+  _indices = indices;
+  _depth = depth;
+  _on_value = on_value;
+  _off_value = off_value;
+  _output = output;
+  _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+  if (0 == _axis)
+  {
+    switch (_indices->info()->data_type())
+    {
+      case DataType::U32:
+        _func = &NEOneHotKernel::onehot_0_axis<uint32_t>;
+        break;
+      case DataType::S32:
+        _func = &NEOneHotKernel::onehot_0_axis<int32_t>;
+        break;
+      default:
+        ARM_COMPUTE_ERROR("Not supported");
+        break;
+    }
+  }
+  else
+  {
+    switch (_indices->info()->data_type())
+    {
+      case DataType::U32:
+        _func = &NEOneHotKernel::onehot_n_axis<uint32_t>;
+        break;
+      case DataType::S32:
+        _func = &NEOneHotKernel::onehot_n_axis<int32_t>;
+        break;
+      default:
+        ARM_COMPUTE_ERROR("Not supported");
+        break;
+    }
+  }
+  // Create window
+  Window win = calculate_max_window(*output->info(), Steps());
+  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+  INEKernel::configure(win);
+}
+
+Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+                                const ITensorInfo *on_value, const ITensorInfo *off_value,
+                                const ITensorInfo *output, int axis)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_arguments(indices, depth, on_value, off_value, output, axis));
+  return Status{};
+}
+
+void NEOneHotKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON(_func == nullptr);
+  (this->*_func)(window, info);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
new file mode 100644
index 000000000..267228eac
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Utils.h"
+
+namespace arm_compute
+{
+CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
+      _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
+{
+}
+
+Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+                                    const ReductionOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+                                      op != ReductionOperation::ARG_IDX_MIN,
+                                  "Invalid reduction operation");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+                                  "Reduction axis greater than max number of dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+  const unsigned int num_of_stages =
+      calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+
+  DataType output_data_type = DataType::S32;
+  TensorInfo not_reshaped_output;
+  const auto input_num_channles = input->num_channels();
+  const auto input_qinfo = input->quantization_info();
+
+  if (output->total_size() != 0)
+  {
+    output_data_type = output->data_type();
+    const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis,
+                                                                   false));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+  }
+
+  auto shape_before_reshape = input->tensor_shape();
+  shape_before_reshape.set(axis, 1);
+  auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
+                                  int num_channels, QuantizationInfo qinfo) {
+    ti.set_data_type(data_type)
+        .set_tensor_shape(shape)
+        .set_num_channels(num_channels)
+        .set_quantization_info(qinfo);
+  };
+
+  initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
+                        input_num_channles, input_qinfo);
+
+  if (num_of_stages == 1)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
+  }
+  else
+  {
+    // Create temporary tensor infos
+    std::vector<TensorInfo> sums_vector(num_of_stages - 1);
+
+    // Create intermediate tensor info
+    TensorShape shape{input->tensor_shape()};
+
+    for (unsigned int i = 0; i < num_of_stages - 1; i++)
+    {
+      shape.set(0, ceil(shape.x() / 128.f));
+      sums_vector[i].set_data_type(input->data_type());
+      sums_vector[i].set_tensor_shape(shape);
+      sums_vector[i].set_num_channels(input->num_channels());
+    }
+
+    // Validate ReductionOperation only on first kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
+
+    // Validate ReductionOperation on intermediate stages
+    for (unsigned int i = 1; i < num_of_stages - 1; ++i)
+    {
+      ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1],
+                                                                     &sums_vector[i], axis, op));
+    }
+
+    // Validate ReductionOperation on the last stage
+    const unsigned int last_stage = num_of_stages - 1;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
+        input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
+  }
+  ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+  return Status{};
+}
+
+void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
+                                   const ReductionOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+  _reduction_axis = axis;
+
+  const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
+      input->info()->tensor_shape(), axis, false);
+  DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
+                                  ? DataType::S32
+                                  : output->info()->data_type();
+  auto_init_if_empty(*output->info(), input->info()
+                                          ->clone()
+                                          ->set_tensor_shape(output_shape)
+                                          .set_data_type(output_data_type)
+                                          .reset_padding()
+                                          .set_is_resizable(true));
+
+  // Configure reduction operation kernels
+  _reduction_kernels_vector.resize(_num_of_stages);
+
+  _memory_group.manage(&_not_reshaped_output);
+  // Create temporary tensors
+  if (_num_of_stages == 1)
+  {
+    // Force an early initialization for int64 output type
+    TensorShape output_shape{input->info()->tensor_shape()};
+    output_shape.set(axis, 1);
+    auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+                                                         ->clone()
+                                                         ->set_tensor_shape(output_shape)
+                                                         .set_data_type(output_data_type)
+                                                         .reset_padding()
+                                                         .set_is_resizable(true));
+    _not_reshaped_output.info()->set_tensor_shape(output_shape);
+    _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
+  }
+  else
+  {
+    _results_vector.resize(_num_of_stages - 1);
+    TensorShape shape{input->info()->tensor_shape()};
+    for (unsigned int i = 0; i < _num_of_stages - 1; i++)
+    {
+      shape.set(0, ceil(shape.x() / 128.f));
+      _results_vector[i].allocator()->init(
+          input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
+    }
+
+    // Apply ReductionOperation only on first kernel
+    _memory_group.manage(&_results_vector[0]);
+    _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
+
+    // Apply ReductionOperation on intermediate stages
+    for (unsigned int i = 1; i < _num_of_stages - 1; ++i)
+    {
+      _memory_group.manage(&_results_vector[i]);
+      _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i],
+                                             axis, op);
+      _results_vector[i - 1].allocator()->allocate();
+    }
+
+    // Apply ReductionOperation on the last stage
+    const unsigned int last_stage = _num_of_stages - 1;
+    _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1],
+                                                    &_not_reshaped_output, axis, op);
+    _results_vector[last_stage - 1].allocator()->allocate();
+  }
+  _reshape_kernel.configure(&_not_reshaped_output, output);
+  _not_reshaped_output.allocator()->allocate();
+}
+
+void CLArgMinMaxLayerEx::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  for (unsigned int i = 0; i < _num_of_stages; ++i)
+  {
+    CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+  }
+  CLScheduler::get().enqueue(_reshape_kernel, false);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
new file mode 100644
index 000000000..c7d0ac8e2
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLCastBool.h"
+
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
+
+using namespace arm_compute;
+
+void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
+{
+  auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>();
+  k->configure(input, output);
+  _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
new file mode 100644
index 000000000..aa9f32ec6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLOneHot.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+namespace arm_compute
+{
+CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                         const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+  _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+                         PixelValue off_value, int depth, int axis)
+{
+  _has_to_memset = true;
+  _memset_kernel.configure(output, off_value);
+  _onehot_kernel.configure(indices, on_value, output, depth, axis);
+}
+Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                          const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                          int axis)
+{
+  return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::run()
+{
+  if (_has_to_memset)
+  {
+    CLScheduler::get().enqueue(_memset_kernel, true);
+  }
+
+  CLScheduler::get().enqueue(_onehot_kernel, false);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index 38401100c..b198e7330 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -120,6 +120,11 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
   const size_t num_of_kernels = axis.size();
   const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
 
+  if (num_of_kernels < 1)
+  {
+    throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
+  }
+
   _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
   _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
new file mode 100644
index 000000000..a502f032e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSplitVEx.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include <cassert>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs,
+                          unsigned int num_splits)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1,
+                                  "size_splits must be a 1-D tensor.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(),
+                                  "Number of output tensors does not match number of splits.");
+  return Status{};
+}
+
+Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs,
+                       uint32_t split_dim)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+  ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions());
+  ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+  // Start/End coordinates
+  Coordinates start_coords;
+  Coordinates end_coords;
+  for (unsigned int d = 0; d < input->num_dimensions(); ++d)
+  {
+    end_coords.set(d, -1);
+  }
+  unsigned int axis_offset = 0;
+  // Validate output tensors
+  for (const auto &output : outputs)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    // Get output shape
+    const TensorShape output_shape = output->tensor_shape();
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+    const size_t axis_split_step = output_shape[split_dim];
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo tmp_output_info = *output->clone();
+    auto_init_if_empty(tmp_output_info,
+                       input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+    // Update coordinate on axis
+    start_coords.set(split_dim, axis_offset);
+    end_coords.set(split_dim, axis_offset + axis_split_step);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
+
+    axis_offset += axis_split_step;
+  }
+
+  return Status{};
+}
+
+void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs,
+                      std::vector<CLSlice> &_slice_functions, uint32_t split_dim)
+{
+  unsigned int axis_offset = 0;
+  // Start/End coordinates
+  Coordinates start_coords;
+  Coordinates end_coords;
+  for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d)
+  {
+    end_coords.set(d, -1);
+  }
+  int out_iter = 0;
+  for (const auto &output : outputs)
+  {
+    const TensorShape output_shape = output->info()->tensor_shape();
+    auto op_size = output_shape.total_size();
+    if (!op_size)
+    {
+      continue;
+    }
+
+    assert(op_size != 0);
+    assert(split_dim <= output_shape.num_dimensions());
+
+    const size_t axis_split_step = output_shape[split_dim];
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo tmp_output_info = *output->info()->clone();
+    auto_init_if_empty(
+        tmp_output_info,
+        input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+    // Update coordinate on axis
+    start_coords.set(split_dim, axis_offset);
+    end_coords.set(split_dim, axis_offset + axis_split_step);
+
+    // Configure slice function
+    _slice_functions[out_iter].configure(input, output, start_coords, end_coords);
+
+    // Set valid region from shape
+    outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+    axis_offset += axis_split_step;
+  }
+}
+
+} // namespace
+
+CLSplitVEx::CLSplitVEx()
+    : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions()
+{
+}
+
+void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+                           const std::vector<ICLTensor *> &outputs, unsigned int num_splits)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits));
+
+  _input = input;
+  _size_splits = size_splits;
+  _outputs = outputs;
+  _num_splits = num_splits;
+
+  // Create tensor slices
+  _slice_functions.resize(_num_splits);
+
+  // Extract output tensor info
+  std::vector<ITensorInfo *> outputs_info;
+  for (auto &output : _outputs)
+  {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    outputs_info.emplace_back(output->info());
+  }
+
+  // Validate slices
+  ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim));
+
+  // Configure slices
+  configure_slices(_input, _outputs, _slice_functions, split_dim);
+}
+
+void CLSplitVEx::run()
+{
+  // execute the slices
+  for (unsigned i = 0; i < _outputs.size(); ++i)
+  {
+    _slice_functions[i].run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
index 768c15b41..6ad3e1b12 100644
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
@@ -15,7 +15,7 @@
  */
 
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,18 +37,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/runtime/NEON/functions/NECastBool.h"
 
-#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
-
-#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
 
-void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
-                            const ITensor *off_value, ITensor *output, const int axis)
+void NECastBool::configure(const ITensor *input, ITensor *output)
 {
-  auto k = support::cpp14::make_unique<CPPOneHotKernelEx>();
-  k->configure(indices, depth, on_value, off_value, output, axis);
+  auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>();
+  k->configure(input, output);
   _kernel = std::move(k);
 }
+
+Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+  return NECastBoolKernel::validate(input, output);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
new file mode 100644
index 000000000..275c55024
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "support/MemorySupport.h"
+#include <utility>
+namespace arm_compute
+{
+void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+                         const ITensor *off_value, ITensor *output, int axis)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>();
+  k->configure(indices, depth, on_value, off_value, output, axis);
+  _kernel = std::move(k);
+}
+Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+                          const ITensorInfo *on_value, const ITensorInfo *off_value,
+                          const ITensorInfo *output, int axis)
+{
+  return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis);
+}
+} // namespace arm_compute
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
index 609dd45a3..09f67259c 100644
--- a/compute/cker/CMakeLists.txt
+++ b/compute/cker/CMakeLists.txt
@@ -8,9 +8,6 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
 target_link_libraries(nnfw_lib_cker INTERFACE ruy)
 target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
 target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
-if(EXPERIMENTAL_RUY_FEATURE)
-  target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE)
-endif(EXPERIMENTAL_RUY_FEATURE)
 if(PROFILE_RUY)
   target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
 endif(PROFILE_RUY)
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
index 246fd9a46..e6f5c0477 100644
--- a/compute/cker/include/cker/NeonTensorUtils.h
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -20,11 +20,13 @@
 
 #include <ruy/path.h>
 #include <ruy/ruy.h>
-#include <ruy/detect_arm.h>
 #include "cker/Types.h"
 #include "cker/neon/neon_check.h"
 #include "cker/ruy/RuySupport.h"
 #include "util/logging.h"
+#if defined __linux__ && defined __aarch64__
+#include <sys/auxv.h>
+#endif
 
 #include <cassert>
 #include <cmath>
@@ -73,14 +75,37 @@ inline int32_t AccumulateNeonLane(const int32x4_t lane)
 
 } // namespace
 
-#ifdef __aarch64__
+// The implementation of dotprod detection is copied from ruy's internal
+// function DetectDotprod().
+// At the moment it's only implemented on Linux ARM64. Consider syncing again
+// with ruy in the future to share improvements.
+#if defined __linux__ && defined __aarch64__
+bool DetectDotprodByLinuxAuxvMethod()
+{
+  // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
+  // however we need to support building against older headers for the time
+  // being.
+  const int kLocalHwcapAsimddp = 1 << 20;
+  return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
+}
+#endif
+
+bool DetectArmNeonDotprod()
+{
+#if defined __linux__ && defined __aarch64__
+  return DetectDotprodByLinuxAuxvMethod();
+#endif
+
+  return false;
+}
 
 bool HasSdotInstruction()
 {
-  static const bool has_dotprod = ruy::DetectDotprod();
+  static const bool has_dotprod = DetectArmNeonDotprod();
   return has_dotprod;
 }
 
+#ifdef __aarch64__
 // We interleave vector data to make the dot product logic more efficient.
 // Suppose that vectors is:
 //     a0 a1 a2 a3 a4 a5 ...
@@ -552,7 +577,7 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
   lhs_params.order = Order::kRowMajor;
   lhs_params.rows = n_output;
   lhs_params.cols = n_input;
-  lhs_params.cacheable = true;
+  lhs_params.cache_policy = CachePolicy::kAlwaysCache;
 
   MatrixParams<int8_t> rhs_params;
   rhs_params.order = Order::kColMajor;
@@ -574,15 +599,15 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
   ruy::Matrix<int8_t> ruy_lhs;
   ruy::Matrix<int8_t> ruy_rhs;
   ruy::Matrix<int32_t> ruy_dst;
-  ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs);
-  ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs);
+  // Note that cache is always enabled for input and weight tensors
+  ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true);
+  ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true);
   ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);
 
-  ruy::BasicSpec<int32_t, int32_t> ruy_spec;
-  ruy_support::MakeRuySpec(gemm_params, &ruy_spec);
+  ruy::BasicSpec<int32_t, int32_t> ruy_mul_params;
+  ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);
 
-  constexpr ruy::Path kRuyPath = ruy::kAllPaths;
-  ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst);
+  ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
 }
 
 void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
index c0c9313ea..add6f83e4 100644
--- a/compute/cker/include/cker/Types.h
+++ b/compute/cker/include/cker/Types.h
@@ -372,6 +372,13 @@ enum class Order
   kRowMajor
 };
 
+enum class CachePolicy : std::uint8_t
+{
+  kNeverCache,
+  kCacheIfLargeSpeedup,
+  kAlwaysCache,
+};
+
 // MatrixParams encapsulates the parameters that Gemm needs about each
 // matrix, besides the buffer data pointer.
 // Compare to ruy::Matrix, which also encapsulates the data pointer.
@@ -390,10 +397,13 @@ template <typename Scalar> struct MatrixParams
   // The zero_point, i.e. which Scalar value is to be interpreted as zero.
   // When Scalar is floating-point, this must be 0.
   Scalar zero_point = 0;
-  // Indicate whether the underlying data will remain unchanged for
-  // some period of time. Defaults to false, but should be set to true
-  // for unchanging data (e.g. weights buffers in many cases)
-  bool cacheable = false;
+  // When the data pointed to by this matrix is constant data, so that it is
+  // valid to assume that equality of pointers implies equality of data,
+  // a CachePolicy may be used instead of the default kNeverCache,
+  // which will enable ruy to take advantage of this constancy of the data to
+  // cache the packing work, which can be a large speedup in matrix*vector
+  // and other narrow shapes.
+  CachePolicy cache_policy = CachePolicy::kNeverCache;
 };
 
 // Enumeration of broad categories of Gemm.
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
index 8aef1f8c1..d9917a9da 100644
--- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -246,9 +246,8 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
                      output_data);
       break;
     case nnfw::cker::BinaryArithmeticOpType::DIV:
-      reference::BinaryArithmeticOp<float>(params, input1_shape, input1_data, input2_shape,
-                                           input2_data, output_shape, output_data,
-                                           GetBinaryArtithmeticFn<op_type, float>());
+      optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+                     output_data);
       break;
     default:
       assert(false);
@@ -312,7 +311,13 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const S
                                       output_shape, output_data);
       break;
     case nnfw::cker::BinaryArithmeticOpType::SUB:
+      optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+                                      output_shape, output_data);
+      break;
     case nnfw::cker::BinaryArithmeticOpType::DIV:
+      optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+                                      output_shape, output_data);
+      break;
     case nnfw::cker::BinaryArithmeticOpType::POW:
       reference::BroadcastBinaryArithmeticOpSlow<float>(
           params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
index 4280c9ae2..238bc52f3 100644
--- a/compute/cker/include/cker/operation/FullyConnected.h
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -19,6 +19,7 @@
 #define __NNFW_CKER_FULLY_CONNECTED_H__
 
 #include <ruy/context.h>
+#include "cker/operation/FullyConnectedSparse16x1.h"
 #include "cker/Shape.h"
 #include "cker/Types.h"
 #include "cker/Utils.h"
@@ -208,12 +209,13 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
   return;
 }
 
-inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const Shape &input_shape,
-                                       const float *input_data, const Shape &weights_shape,
-                                       const float *weights_data, const Shape &bias_shape,
-                                       const float *bias_data, const Shape &output_shape,
-                                       float *output_data, int w0_size, const uint16_t *w1_segments,
-                                       const uint16_t *w1_indices)
+inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
+                                             const Shape &input_shape, const float *input_data,
+                                             const Shape &weights_shape, const float *weights_data,
+                                             const Shape &bias_shape, const float *bias_data,
+                                             const Shape &output_shape, float *output_data,
+                                             const uint16_t *w1_segments,
+                                             const uint16_t *w1_indices)
 {
   UNUSED_RELEASE(params);
   UNUSED_RELEASE(input_shape);
@@ -239,7 +241,7 @@ inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const
   }
   for (int b = 0; b < batches; ++b)
   {
-    for (int idx_0 = 0; idx_0 < w0_size; ++idx_0)
+    for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
     {
       for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
       {
diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
new file mode 100644
index 000000000..28ae7a3bc
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
+                                           const Shape &input_shape, const float *input_data,
+                                           const Shape &weights_shape, const float *weights_data,
+                                           const Shape &bias_shape, const float *bias_data,
+                                           const Shape &output_shape, float *output_data,
+                                           const uint16_t *w1_segments, const uint16_t *w1_indices)
+{
+  UNUSED_RELEASE(input_shape);
+
+  assert(weights_shape.DimensionsCount() == 2);
+  assert(output_shape.DimensionsCount() == 2);
+
+  const int output_dims_count = output_shape.DimensionsCount();
+  const int weights_dims_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+  const int output_depth =
+      MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+  UNUSED_RELEASE(bias_shape);
+  if (bias_data)
+  {
+    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+  }
+  else
+  {
+    ZeroVector(output_data, batches * output_depth);
+  }
+  for (int b = 0; b < batches; ++b)
+  {
+    int depth_size = output_depth / 16;
+    for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
+#ifdef USE_NEON
+    {
+      float *__restrict y;
+      y = &output_data[b * output_depth + idx_0 * 16];
+      /* keep y[0..15] in registers for duration of inner loop */
+      float32x4_t y0_3 = vld1q_f32(&y[0]);
+      float32x4_t y4_7 = vld1q_f32(&y[4]);
+      float32x4_t y8_11 = vld1q_f32(&y[8]);
+      float32x4_t y12_15 = vld1q_f32(&y[12]);
+      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+      {
+        auto idx_1 = w1_indices[pw1];
+        float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
+        float32x4_t wvec;
+
+        wvec = vld1q_f32(&weights_data[0]);
+        y0_3 = vmlaq_f32(y0_3, wvec, xj);
+        wvec = vld1q_f32(&weights_data[4]);
+        y4_7 = vmlaq_f32(y4_7, wvec, xj);
+        wvec = vld1q_f32(&weights_data[8]);
+        y8_11 = vmlaq_f32(y8_11, wvec, xj);
+        wvec = vld1q_f32(&weights_data[12]);
+        y12_15 = vmlaq_f32(y12_15, wvec, xj);
+
+        weights_data += 16;
+      }
+      /* save y[0..15] back to memory */
+      vst1q_f32(&y[0], y0_3);
+      vst1q_f32(&y[4], y4_7);
+      vst1q_f32(&y[8], y8_11);
+      vst1q_f32(&y[12], y12_15);
+    }
+#else
+    {
+      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+      {
+        float *__restrict y;
+        float xj;
+        auto idx_1 = w1_indices[pw1];
+        xj = input_data[b * accum_depth + idx_1];
+        y = &output_data[b * output_depth + idx_0 * 16];
+        y[0] += weights_data[0] * xj;
+        y[1] += weights_data[1] * xj;
+        y[2] += weights_data[2] * xj;
+        y[3] += weights_data[3] * xj;
+        y[4] += weights_data[4] * xj;
+        y[5] += weights_data[5] * xj;
+        y[6] += weights_data[6] * xj;
+        y[7] += weights_data[7] * xj;
+        y[8] += weights_data[8] * xj;
+        y[9] += weights_data[9] * xj;
+        y[10] += weights_data[10] * xj;
+        y[11] += weights_data[11] * xj;
+        y[12] += weights_data[12] * xj;
+        y[13] += weights_data[13] * xj;
+        y[14] += weights_data[14] * xj;
+        y[15] += weights_data[15] * xj;
+        weights_data += 16;
+      }
+    }
+#endif
+  }
+  if (params.activation != FusedActivationFunctionType::kNone)
+  {
+    // Apply activation function
+    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+  }
+}
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h
index cf9634a67..2b2e8d338 100644
--- a/compute/cker/include/cker/operation/Reduce.h
+++ b/compute/cker/include/cker/operation/Reduce.h
@@ -21,6 +21,7 @@
 #include "cker/Shape.h"
 #include "cker/Types.h"
 #include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
 
 namespace nnfw
 {
@@ -30,6 +31,74 @@ namespace cker
 // A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
 // This method iterates through input data and reduce elements along the
 // dimensions given in axis.
+
+#ifdef USE_NEON
+inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
+                               float *output_data)
+{
+  const auto input_dims = input_shape.DimsData();
+  const auto input_num_dims = input_shape.DimensionsCount();
+
+  int input_size = 1;
+  int reduce_size = 0;
+  for (int idx = 0; idx < input_num_dims - 1; idx++)
+  {
+    input_size *= input_dims[idx];
+  }
+  reduce_size = input_dims[input_num_dims - 1];
+  for (int idx = 0; idx < input_size; idx++)
+  {
+    int r_idx = 0;
+    float tmp_data[4] = {
+        0,
+    };
+    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
+    for (; r_idx <= reduce_size - 32; r_idx += 32)
+    {
+      float32x4_t a10 = vld1q_f32(input_data + r_idx);
+      float32x4_t a11 = vld1q_f32(input_data + r_idx + 4);
+      float32x4_t a12 = vld1q_f32(input_data + r_idx + 8);
+      float32x4_t a13 = vld1q_f32(input_data + r_idx + 12);
+      float32x4_t a20 = vld1q_f32(input_data + r_idx + 16);
+      float32x4_t a21 = vld1q_f32(input_data + r_idx + 20);
+      float32x4_t a22 = vld1q_f32(input_data + r_idx + 24);
+      float32x4_t a23 = vld1q_f32(input_data + r_idx + 28);
+
+      float32x4_t x0 = vaddq_f32(a10, a20);
+      float32x4_t x1 = vaddq_f32(a11, a21);
+      float32x4_t x2 = vaddq_f32(a12, a22);
+      float32x4_t x3 = vaddq_f32(a13, a23);
+
+      float32x4_t y0 = vaddq_f32(x0, x1);
+      float32x4_t y1 = vaddq_f32(x2, x3);
+      float32x4_t y2 = vaddq_f32(y0, y1);
+      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
+    }
+    for (; r_idx <= reduce_size - 8; r_idx += 8)
+    {
+      float32x4_t a1 = vld1q_f32(input_data + r_idx);
+      float32x4_t a2 = vld1q_f32(input_data + r_idx + 4);
+      float32x4_t x = vaddq_f32(a1, a2);
+      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
+    }
+    vst1q_f32(tmp_data, tmp_data_32x4);
+    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
+
+    for (; r_idx < reduce_size; r_idx++)
+    {
+      if (r_idx == 0)
+      {
+        output_data[idx] = input_data[idx * reduce_size];
+      }
+      else
+      {
+        output_data[idx] += input_data[idx * reduce_size + r_idx];
+      }
+    }
+  }
+}
+#endif // NEON
+
 template <typename In, typename Out>
 inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
                        const int *axis, const int num_axis, int *input_iter,
@@ -39,6 +108,32 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha
   const auto input_num_dims = input_shape.DimensionsCount();
 
   // Reset input iterator.
+  if (num_axis == 1 && axis[0] == input_num_dims - 1)
+  {
+    int input_size = 1;
+    int reduce_size = 0;
+    for (int idx = 0; idx < input_num_dims - 1; idx++)
+    {
+      input_size *= input_dims[idx];
+    }
+    reduce_size = input_dims[input_num_dims - 1];
+    for (int idx = 0; idx < input_size; idx++)
+    {
+      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
+      {
+        if (r_idx == 0)
+        {
+          output_data[idx] = input_data[idx * reduce_size];
+        }
+        else
+        {
+          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
+        }
+      }
+    }
+    return true;
+  }
+
   for (int idx = 0; idx < input_num_dims; ++idx)
   {
     input_iter[idx] = 0;
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
index 13e50b87a..0e0f364ba 100644
--- a/compute/cker/include/cker/operation/SoftMax.h
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -32,6 +32,45 @@ namespace nnfw
 namespace cker
 {
 
+namespace reference
+{
+
+// Note. This Softmax function supports all of dimensions
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c)
+    {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c)
+    {
+      sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta));
+    }
+
+    // Compute result.
+    for (int c = 0; c < depth; ++c)
+    {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum;
+    }
+  }
+}
+}
+
 // Performs softmax along the input of size (input_size * batch_size).
 inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
                     float *out)
diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h
index c57b4daa0..2f1089575 100644
--- a/compute/cker/include/cker/operation/StridedSlice.h
+++ b/compute/cker/include/cker/operation/StridedSlice.h
@@ -260,12 +260,41 @@ template <typename T>
 inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape,
                          const T *input_data, const Shape &unextended_output_shape, T *output_data)
 {
-  // Note that the output_shape is not used herein.
-  StridedSliceParams params_copy = op_params;
-
   assert(unextended_input_shape.DimensionsCount() <= 4);
   assert(unextended_output_shape.DimensionsCount() <= 4);
 
+  bool optimize = true;
+  int st_count = op_params.strides_count;
+  for (int idx = 0; idx < st_count - 1; idx++)
+  {
+    const int axis_size = unextended_input_shape.Dims(idx);
+    const int start = StartForAxis(op_params, unextended_input_shape, idx);
+    const int stop = StopForAxis(op_params, unextended_input_shape, idx, start);
+    if ((axis_size != 1) && (start != 0 || stop != 0))
+    {
+      optimize = false;
+      break;
+    }
+  }
+
+  if (optimize)
+  {
+    if (op_params.strides[st_count - 1] == 1)
+    {
+      const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1);
+      const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start);
+
+      for (int idx = 0; idx < end - start; idx++)
+      {
+        output_data[idx] = input_data[idx + start];
+      }
+      return;
+    }
+  }
+
+  // Note that the output_shape is not used herein.
+  StridedSliceParams params_copy = op_params;
+
   const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
   const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
 
diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
index ac5069917..912b01a64 100644
--- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
@@ -19,6 +19,8 @@
 #define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__
 
 #include <functional>
+#include <limits>
+#include <utility>
 #include "cker/neon/neon_check.h"
 #include "cker/operation/reference/BinaryArithmeticOps.h"
 #include "cker/Shape.h"
@@ -34,7 +36,7 @@ namespace optimized
 {
 
 template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
-inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params,
+inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params, bool switch_inputs,
                                     const Shape & /* unswitched_input1_shape */,
                                     const T *unswitched_input1_data,
                                     const Shape & /* unswitched_input2_shape */,
@@ -42,11 +44,8 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params,
                                     const Shape & /* output_shape */, T *output_data,
                                     ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
 {
-  const bool use_unswitched =
-      params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast;
-
-  const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
-  const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+  const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data;
+  const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data;
 
   // Fivefold nested loops. The second input resets its position for each
   // iteration of the second loop. The first input resets its position at the
@@ -219,8 +218,136 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
   }
 }
 
-inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
-                           const float *input1_data, const float *input2_data, float *output_data)
+struct BinaryOpFuncAddFloat
+{
+#ifdef USE_NEON
+  static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+  {
+    return vaddq_f32(a, b);
+  }
+#endif // USE_NEON
+  static inline float calculate(const float a, const float b) { return a + b; }
+};
+
+struct BinaryOpFuncSubFloat
+{
+#ifdef USE_NEON
+  static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+  {
+    return vsubq_f32(a, b);
+  }
+#endif // USE_NEON
+  static inline float calculate(const float a, const float b) { return a - b; }
+};
+
+struct BinaryOpFuncMulFloat
+{
+#ifdef USE_NEON
+  static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+  {
+    return vmulq_f32(a, b);
+  }
+#endif // USE_NEON
+  static inline float calculate(const float a, const float b) { return a * b; }
+};
+
+struct BinaryOpFuncDivFloat
+{
+#ifdef USE_NEON
+#ifdef __aarch64__
+  static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+  {
+    return vdivq_f32(a, b);
+  }
+#endif // __aarch64__
+#endif // USE_NEON
+  static inline float calculate(const float a, const float b) { return a / b; }
+};
+
+template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs
+{
+  template <typename T> static inline T calculate(const T &a, const T &b)
+  {
+    return BASEOPERATOR::calculate(b, a);
+  }
+};
+
+struct BinaryOpActivationFloatNone
+{
+#ifdef USE_NEON
+  static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+  {
+    (void)ceilingParam; // suppress unused argument warning
+    return value;
+  }
+  static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+  {
+    (void)floorParam;
+    return value;
+  }
+#endif // USE_NEON
+  static inline float applyCeiling(const float value, const float ceilingParam)
+  {
+    (void)ceilingParam;
+    return value;
+  }
+  static inline float applyFloor(const float value, const float floorParam)
+  {
+    (void)floorParam;
+    return value;
+  }
+};
+
+struct BinaryOpActivationFloatMax
+{
+#ifdef USE_NEON
+  static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+  {
+    (void)ceilingParam; // suppress unused argument warning
+    return value;
+  }
+  static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+  {
+    return vmaxq_f32(value, floorParam);
+  }
+#endif // USE_NEON
+  static inline float applyCeiling(const float value, const float ceilingParam)
+  {
+    (void)ceilingParam;
+    return value;
+  }
+  static inline float applyFloor(const float value, const float floorParam)
+  {
+    return std::max(value, floorParam);
+  }
+};
+
+struct BinaryOpActivationFloatMinMax
+{
+#ifdef USE_NEON
+  static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+  {
+    return vminq_f32(value, ceilingParam);
+  }
+  static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+  {
+    return vmaxq_f32(value, floorParam);
+  }
+#endif // USE_NEON
+  static inline float applyCeiling(const float value, const float ceilingParam)
+  {
+    return std::min(value, ceilingParam);
+  }
+  static inline float applyFloor(const float value, const float floorParam)
+  {
+    return std::max(value, floorParam);
+  }
+};
+
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam &params,
+                                const float *input1_data, const float *input2_data,
+                                float *output_data)
 {
   int i = 0;
 
@@ -237,18 +364,18 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
     auto a21 = vld1q_f32(input2_data + i + 4);
     auto a22 = vld1q_f32(input2_data + i + 8);
     auto a23 = vld1q_f32(input2_data + i + 12);
-    auto x0 = vaddq_f32(a10, a20);
-    auto x1 = vaddq_f32(a11, a21);
-    auto x2 = vaddq_f32(a12, a22);
-    auto x3 = vaddq_f32(a13, a23);
-    x0 = vmaxq_f32(activation_min, x0);
-    x1 = vmaxq_f32(activation_min, x1);
-    x2 = vmaxq_f32(activation_min, x2);
-    x3 = vmaxq_f32(activation_min, x3);
-    x0 = vminq_f32(activation_max, x0);
-    x1 = vminq_f32(activation_max, x1);
-    x2 = vminq_f32(activation_max, x2);
-    x3 = vminq_f32(activation_max, x3);
+    auto x0 = OPERATOR::calculate(a10, a20);
+    auto x1 = OPERATOR::calculate(a11, a21);
+    auto x2 = OPERATOR::calculate(a12, a22);
+    auto x3 = OPERATOR::calculate(a13, a23);
+    x0 = ACTIVATION::applyFloor(x0, activation_min);
+    x1 = ACTIVATION::applyFloor(x1, activation_min);
+    x2 = ACTIVATION::applyFloor(x2, activation_min);
+    x3 = ACTIVATION::applyFloor(x3, activation_min);
+    x0 = ACTIVATION::applyCeiling(x0, activation_max);
+    x1 = ACTIVATION::applyCeiling(x1, activation_max);
+    x2 = ACTIVATION::applyCeiling(x2, activation_max);
+    x3 = ACTIVATION::applyCeiling(x3, activation_max);
     vst1q_f32(output_data + i, x0);
     vst1q_f32(output_data + i + 4, x1);
     vst1q_f32(output_data + i + 8, x2);
@@ -258,20 +385,94 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
   {
     auto a1 = vld1q_f32(input1_data + i);
     auto a2 = vld1q_f32(input2_data + i);
-    auto x = vaddq_f32(a1, a2);
-    x = vmaxq_f32(activation_min, x);
-    x = vminq_f32(activation_max, x);
-    vst1q_f32(output_data + i, x);
+    auto x = OPERATOR::calculate(a1, a2); // vaddq
+    auto x_clamped =
+        ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+    vst1q_f32(output_data + i, x_clamped);
   }
-#endif // NEON
+#endif // USE_NEON
   for (; i < size; i++)
   {
-    auto x = input1_data[i] + input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min,
-                                                         params.float_activation_max);
+    auto x = OPERATOR::calculate(input1_data[i], input2_data[i]);
+    output_data[i] = ACTIVATION::applyCeiling(
+        ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
   }
 }
 
+// Broadcast binary op template that can often be used for inner loop
+// This function will handle scalar_value (LHS) and vector_values (RHS).
+// Since it's a float function, input params does not matter here.
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
+                                    const float broadcast_value, const float *input2_data,
+                                    float *output_data)
+{
+  int i = 0;
+
+#ifdef USE_NEON
+  const auto activation_min = vdupq_n_f32(params.float_activation_min);
+  const auto activation_max = vdupq_n_f32(params.float_activation_max);
+  const auto broadcast_value_dup = vdupq_n_f32(broadcast_value);
+  for (; i <= size - 16; i += 16)
+  {
+    auto a20 = vld1q_f32(input2_data + i);
+    auto a21 = vld1q_f32(input2_data + i + 4);
+    auto a22 = vld1q_f32(input2_data + i + 8);
+    auto a23 = vld1q_f32(input2_data + i + 12);
+    auto x0 = OPERATOR::calculate(broadcast_value_dup, a20);
+    auto x1 = OPERATOR::calculate(broadcast_value_dup, a21);
+    auto x2 = OPERATOR::calculate(broadcast_value_dup, a22);
+    auto x3 = OPERATOR::calculate(broadcast_value_dup, a23);
+    x0 = ACTIVATION::applyFloor(x0, activation_min);
+    x1 = ACTIVATION::applyFloor(x1, activation_min);
+    x2 = ACTIVATION::applyFloor(x2, activation_min);
+    x3 = ACTIVATION::applyFloor(x3, activation_min);
+    x0 = ACTIVATION::applyCeiling(x0, activation_max);
+    x1 = ACTIVATION::applyCeiling(x1, activation_max);
+    x2 = ACTIVATION::applyCeiling(x2, activation_max);
+    x3 = ACTIVATION::applyCeiling(x3, activation_max);
+    vst1q_f32(output_data + i, x0);
+    vst1q_f32(output_data + i + 4, x1);
+    vst1q_f32(output_data + i + 8, x2);
+    vst1q_f32(output_data + i + 12, x3);
+  }
+  for (; i <= size - 4; i += 4)
+  {
+    auto a2 = vld1q_f32(input2_data + i);
+    auto x = OPERATOR::calculate(broadcast_value_dup, a2);
+    auto x_clamped =
+        ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+    vst1q_f32(output_data + i, x_clamped);
+  }
+#endif // USE_NEON
+  for (; i < size; i++)
+  {
+    auto x = OPERATOR::calculate(broadcast_value, input2_data[i]);
+    output_data[i] = ACTIVATION::applyCeiling(
+        ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
+  }
+}
+
+using BinaryOpImplFloatFuncs =
+    std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *),
+              void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>;
+
+template <class FUNC>
+inline BinaryOpImplFloatFuncs
+getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam &params)
+{
+  if (params.float_activation_max == std::numeric_limits<float>::max())
+    if (params.float_activation_min == std::numeric_limits<float>::lowest())
+      return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>,
+                                    BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>);
+    else
+      return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>,
+                                    BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>);
+  else
+    return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>,
+                                  BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>);
+}
+
 inline void AddQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
                       const uint8_t *input1_data, const Shape &input2_shape,
                       const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
@@ -285,7 +486,8 @@ inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape
                 const Shape &output_shape, float *output_data)
 {
   const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+  auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+  (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
 }
 
 // Scalar-broadcast add that can be used for inner loop of more general
@@ -304,33 +506,6 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
   }
 }
 
-inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
-                               float broadcast_value, const float *input2_data, float *output_data)
-{
-  int i = 0;
-#ifdef USE_NEON
-  const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min);
-  const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max);
-  const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
-  for (; i <= size - 4; i += 4)
-  {
-    const float32x4_t input2_val_original = vld1q_f32(input2_data + i);
-
-    const float32x4_t output = vaddq_f32(input2_val_original, broadcast_value_dup);
-
-    const float32x4_t clamped =
-        vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output));
-    vst1q_f32(output_data + i, clamped);
-  }
-#endif // NEON
-  for (; i < size; ++i)
-  {
-    auto x = broadcast_value + input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min,
-                                                         params.float_activation_max);
-  }
-}
-
 inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam &params,
                                        const Shape &input1_shape, const uint8_t *input1_data,
                                        const Shape &input2_shape, const uint8_t *input2_data,
@@ -350,7 +525,8 @@ inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam &params,
   else
   {
     BinaryBroadcastFiveFold(
-        params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+        params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+        input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
         static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
                              uint8_t *)>(AddElementwiseQuant8),
         static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
@@ -372,12 +548,12 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Sh
   }
   else
   {
-    BinaryBroadcastFiveFold(
-        params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
-        static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *,
-                             float *)>(AddElementwise),
-        static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>(
-            AddScalarBroadcast));
+    auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+
+    BinaryBroadcastFiveFold(params, params.broadcast_category ==
+                                        BroadcastableOpCategory::kSecondInputBroadcastsFast,
+                            input1_shape, input1_data, input2_shape, input2_data, output_shape,
+                            output_data, implFuncs.first, implFuncs.second);
   }
 }
 
@@ -385,54 +561,35 @@ inline void Sub(const BinaryArithmeticOpParam &params, const Shape &input1_shape
                 const float *input1_data, const Shape &input2_shape, const float *input2_data,
                 const Shape &output_shape, float *output_data)
 {
-  int i = 0;
-  const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
-#ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(params.float_activation_min);
-  const auto activation_max = vdupq_n_f32(params.float_activation_max);
-  for (; i <= size - 16; i += 16)
+  const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+  (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastSubDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+                                 const float *input1_data, const Shape &input2_shape,
+                                 const float *input2_data, const Shape &output_shape,
+                                 float *output_data)
+{
+  if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
   {
-    auto a10 = vld1q_f32(input1_data + i);
-    auto a11 = vld1q_f32(input1_data + i + 4);
-    auto a12 = vld1q_f32(input1_data + i + 8);
-    auto a13 = vld1q_f32(input1_data + i + 12);
-    auto a20 = vld1q_f32(input2_data + i);
-    auto a21 = vld1q_f32(input2_data + i + 4);
-    auto a22 = vld1q_f32(input2_data + i + 8);
-    auto a23 = vld1q_f32(input2_data + i + 12);
-    auto x0 = vsubq_f32(a10, a20);
-    auto x1 = vsubq_f32(a11, a21);
-    auto x2 = vsubq_f32(a12, a22);
-    auto x3 = vsubq_f32(a13, a23);
-    x0 = vmaxq_f32(activation_min, x0);
-    x1 = vmaxq_f32(activation_min, x1);
-    x2 = vmaxq_f32(activation_min, x2);
-    x3 = vmaxq_f32(activation_min, x3);
-    x0 = vminq_f32(activation_max, x0);
-    x1 = vminq_f32(activation_max, x1);
-    x2 = vminq_f32(activation_max, x2);
-    x3 = vminq_f32(activation_max, x3);
-    vst1q_f32(output_data + i, x0);
-    vst1q_f32(output_data + i + 4, x1);
-    vst1q_f32(output_data + i + 8, x2);
-    vst1q_f32(output_data + i + 12, x3);
+    auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+    BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+                            output_shape, output_data, implFuncs.first, implFuncs.second);
   }
-  for (; i <= size - 4; i += 4)
+  else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
   {
-    auto a1 = vld1q_f32(input1_data + i);
-    auto a2 = vld1q_f32(input2_data + i);
-    auto x = vsubq_f32(a1, a2);
-    x = vmaxq_f32(activation_min, x);
-    x = vminq_f32(activation_max, x);
-    vst1q_f32(output_data + i, x);
+    auto implFuncs =
+        getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params);
+    BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+                            output_shape, output_data, implFuncs.first, implFuncs.second);
   }
-#endif // NEON
-
-  for (; i < size; i++)
+  else
   {
-    auto x = input1_data[i] - input2_data[i];
-    output_data[i] =
-        ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
+    const std::function<float(const float &, const float &)> fn =
+        [](const float &a, const float &b) -> float { return a - b; };
+    reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+                                               input2_data, output_shape, output_data, fn);
   }
 }
 
@@ -516,60 +673,6 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
   }
 }
 
-inline void MulElementwise(int size, const BinaryArithmeticOpParam &params,
-                           const float *input1_data, const float *input2_data, float *output_data)
-{
-  int i = 0;
-
-#ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(params.float_activation_min);
-  const auto activation_max = vdupq_n_f32(params.float_activation_max);
-  for (; i <= size - 16; i += 16)
-  {
-    auto a10 = vld1q_f32(input1_data + i);
-    auto a11 = vld1q_f32(input1_data + i + 4);
-    auto a12 = vld1q_f32(input1_data + i + 8);
-    auto a13 = vld1q_f32(input1_data + i + 12);
-    auto a20 = vld1q_f32(input2_data + i);
-    auto a21 = vld1q_f32(input2_data + i + 4);
-    auto a22 = vld1q_f32(input2_data + i + 8);
-    auto a23 = vld1q_f32(input2_data + i + 12);
-    auto x0 = vmulq_f32(a10, a20);
-    auto x1 = vmulq_f32(a11, a21);
-    auto x2 = vmulq_f32(a12, a22);
-    auto x3 = vmulq_f32(a13, a23);
-    x0 = vmaxq_f32(activation_min, x0);
-    x1 = vmaxq_f32(activation_min, x1);
-    x2 = vmaxq_f32(activation_min, x2);
-    x3 = vmaxq_f32(activation_min, x3);
-    x0 = vminq_f32(activation_max, x0);
-    x1 = vminq_f32(activation_max, x1);
-    x2 = vminq_f32(activation_max, x2);
-    x3 = vminq_f32(activation_max, x3);
-    vst1q_f32(output_data + i, x0);
-    vst1q_f32(output_data + i + 4, x1);
-    vst1q_f32(output_data + i + 8, x2);
-    vst1q_f32(output_data + i + 12, x3);
-  }
-  for (; i <= size - 4; i += 4)
-  {
-    auto a1 = vld1q_f32(input1_data + i);
-    auto a2 = vld1q_f32(input2_data + i);
-    auto x = vmulq_f32(a1, a2);
-    x = vmaxq_f32(activation_min, x);
-    x = vminq_f32(activation_max, x);
-    vst1q_f32(output_data + i, x);
-  }
-#endif // NEON
-
-  for (; i < size; i++)
-  {
-    auto x = input1_data[i] * input2_data[i];
-    output_data[i] =
-        ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
-  }
-}
-
 inline void MulQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
                       const uint8_t *input1_data, const Shape &input2_shape,
                       const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
@@ -583,7 +686,8 @@ inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape
                 const Shape &output_shape, float *output_data)
 {
   const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+  auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+  (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
 }
 
 inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
@@ -599,38 +703,6 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
   }
 }
 
-// Broadcast mul that can often be used for inner loop of broadcast Mul.
-// This function will handle scalar_value (LHS) * vector_values (RHS).
-// Since it's a float function, input params does not matter here.
-inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam &params,
-                               const float broadcast_value, const float *input2_data,
-                               float *output_data)
-{
-  int i = 0;
-#ifdef USE_NEON
-  const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min);
-  const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max);
-  const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
-  for (; i <= size - 4; i += 4)
-  {
-    const float32x4_t input2_val_original = vld1q_f32(input2_data + i);
-
-    const float32x4_t output = vmulq_f32(input2_val_original, broadcast_value_dup);
-
-    const float32x4_t clamped =
-        vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output));
-    vst1q_f32(output_data + i, clamped);
-  }
-#endif // NEON
-
-  for (; i < size; ++i)
-  {
-    float x = broadcast_value * input2_data[i];
-    output_data[i] =
-        ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
-  }
-}
-
 inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
                                        const Shape &input1_shape, const uint8_t *input1_data,
                                        const Shape &input2_shape, const uint8_t *input2_data,
@@ -649,7 +721,8 @@ inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
     return;
   }
   BinaryBroadcastFiveFold(
-      params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+      params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+      input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
       static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
                            uint8_t *)>(MulElementwiseQuant8),
       static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
@@ -670,12 +743,56 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Sh
                                                input2_data, output_shape, output_data, fn);
     return;
   }
-  BinaryBroadcastFiveFold(
-      params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
-      static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *,
-                           float *)>(MulElementwise),
-      static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>(
-          MulSimpleBroadcast));
+  auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+  BinaryBroadcastFiveFold(params, params.broadcast_category ==
+                                      BroadcastableOpCategory::kSecondInputBroadcastsFast,
+                          input1_shape, input1_data, input2_shape, input2_data, output_shape,
+                          output_data, implFuncs.first, implFuncs.second);
+}
+
+inline void Div(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+                const float *input1_data, const Shape &input2_shape, const float *input2_data,
+                const Shape &output_shape, float *output_data)
+{
+#ifdef __aarch64__
+  const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+  (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+#else
+  const std::function<float(const float &, const float &)> fn =
+      [](const float &a, const float &b) -> float { return a / b; };
+  reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
+                                output_shape, output_data, fn);
+#endif // __aarch64__
+}
+
+inline void BroadcastDivDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+                                 const float *input1_data, const Shape &input2_shape,
+                                 const float *input2_data, const Shape &output_shape,
+                                 float *output_data)
+{
+#ifdef __aarch64__
+  if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+  {
+    auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+    BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+                            output_shape, output_data, implFuncs.first, implFuncs.second);
+  }
+  else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+  {
+    auto implFuncs =
+        getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params);
+    BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+                            output_shape, output_data, implFuncs.first, implFuncs.second);
+  }
+  else
+#endif // __aarch64__
+  {
+    const std::function<float(const float &, const float &)> fn =
+        [](const float &a, const float &b) -> float { return a / b; };
+    reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+                                               input2_data, output_shape, output_data, fn);
+  }
 }
 
 } // namespace optimized
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
index 9612dd517..7b4ff2040 100644
--- a/compute/cker/include/cker/ruy/RuySupport.h
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -19,7 +19,9 @@
 #define __NNFW_CKER_RUY_RUY_SUPPORT_H__
 
 #include <util/ConfigSource.h>
-#include <ruy/context.h>
+#include <ruy/matrix.h>
+#include <ruy/ruy.h>
+#include <cassert>
 #include "cker/Types.h"
 
 namespace nnfw
@@ -29,44 +31,54 @@ namespace cker
 namespace ruy_support
 {
 
-template <typename Scalar, typename DataPointer>
-void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
-                   ruy::Matrix<Scalar> *dst)
+inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy)
 {
-  dst->layout.rows = params.rows;
-  dst->layout.cols = params.cols;
-  if (params.order == Order::kColMajor)
-  {
-    dst->layout.order = ruy::Order::kColMajor;
-    dst->layout.stride = params.rows;
-  }
-  else
+  switch (cache_policy)
   {
-    dst->layout.order = ruy::Order::kRowMajor;
-    dst->layout.stride = params.cols;
+    case CachePolicy::kNeverCache:
+      return ruy::CachePolicy::kNeverCache;
+    case CachePolicy::kCacheIfLargeSpeedup:
+      return ruy::CachePolicy::kCacheIfLargeSpeedup;
+    case CachePolicy::kAlwaysCache:
+      return ruy::CachePolicy::kAlwaysCache;
+    default:
+      assert(false);
+      return ruy::CachePolicy::kNeverCache;
   }
+}
+
+template <typename Scalar, typename DataPointer>
+void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
+                   ruy::Matrix<Scalar> *dst, bool use_caching = false)
+{
+  ruy::Order ruy_order =
+      params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor;
+  ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout());
   // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer.
   // It does care whether we assign to it a Scalar* or a const Scalar*.
-  dst->data = data_ptr;
-  dst->zero_point = params.zero_point;
-  dst->cacheable = params.cacheable;
+  dst->set_data(data_ptr);
+  dst->set_zero_point(params.zero_point);
+  if (use_caching)
+  {
+    dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy));
+  }
 }
 
 template <typename GemmParamsType, typename RuySpecType>
-void MakeRuySpec(const GemmParamsType &params, RuySpecType *ruy_spec)
+void MakeRuyMulParams(const GemmParamsType &params, RuySpecType *ruy_mul_params)
 {
   // This validation has already been performed by the Gemm API entry point,
   // but it doesn't hurt to test specifically this again here, where it's
   // being used.
   ValidateGemmParams(params);
 
-  ruy_spec->multiplier_fixedpoint = params.multiplier_fixedpoint;
-  ruy_spec->multiplier_exponent = params.multiplier_exponent;
-  ruy_spec->multiplier_fixedpoint_perchannel = params.multiplier_fixedpoint_perchannel;
-  ruy_spec->multiplier_exponent_perchannel = params.multiplier_exponent_perchannel;
-  ruy_spec->bias = params.bias;
-  ruy_spec->clamp_min = params.clamp_min;
-  ruy_spec->clamp_max = params.clamp_max;
+  ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint);
+  ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent);
+  ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel);
+  ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel);
+  ruy_mul_params->set_bias(params.bias);
+  ruy_mul_params->set_clamp_min(params.clamp_min);
+  ruy_mul_params->set_clamp_max(params.clamp_max);
 }
 
 } // namespace ruy_support
diff --git a/docs/conf.py b/docs/conf.py
index 9b870097a..34d7c6431 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,7 +21,7 @@ copyright = '2020, Samsung Research & contributors'
 author = 'Samsung Research & contributors'
 
 # The full version, including alpha/beta/rc tags
-release = '1.9.0'
+release = '1.10.0'
 
 # -- General configuration ---------------------------------------------------
 
diff --git a/docs/howto/how-to-build-compiler.md b/docs/howto/how-to-build-compiler.md
index e9dcb263a..75699890a 100644
--- a/docs/howto/how-to-build-compiler.md
+++ b/docs/howto/how-to-build-compiler.md
@@ -1,7 +1,124 @@
 # How to Build Compiler
 
+This document is based on the system where Ubuntu Desktop Linux 18.04 LTS is installed with default
+settings, and can be applied in other environments without much difference. For reference, the
+development of our project started in the Ubuntu Desktop Linux 16.04 LTS environment.
+As of now, to build in 16.04, please use gcc 7.x or above.
+
 ## Build Requires
 
+If you are going to build this project, the following modules must be installed on your system:
+
+- CMake
+- Boost C++ libraries
+
+In the Ubuntu, you can easily install it with the following command.
+
+```
+$ sudo apt-get install cmake libboost-all-dev
+```
+
+If your linux system does not have the basic development configuration, you will need to install
+more packages. A list of all packages needed to configure the development environment can be found
+in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file.
+
+Here is a summary of it
+
+```
+$ sudo apt-get install \
+build-essential \
+clang-format-3.9 \
+cmake \
+doxygen \
+git \
+hdf5-tools \
+lcov \
+libatlas-base-dev \
+libboost-all-dev \
+libgflags-dev \
+libgoogle-glog-dev \
+libgtest-dev \
+libhdf5-dev \
+libprotobuf-dev \
+protobuf-compiler \
+pylint \
+python3 \
+python3-pip \
+python3-venv \
+scons \
+software-properties-common \
+unzip \
+wget
+
+$ mkdir /tmp/gtest
+$ cd /tmp/gtest
+$ cmake /usr/src/gtest
+$ make
+$ sudo mv *.a /usr/lib
+
+$ pip install yapf==0.22.0 numpy
+```
+
 ## Build for Ubuntu
 
-## Build for windows
+In a typical linux development environment, including Ubuntu, you can build the compiler with a
+simple command like this:
+
+```
+$ git clone https://github.com/Samsung/ONE.git one
+$ cd one
+$ ./nncc configure
+$ ./nncc build
+```
+Build artifacts will be placed in `build` folder.
+
+To run unit tests:
+```
+$ ./nncc test
+```
+
+Above steps will build all the modules in the compiler folder. There are modules that are currently
+not active. To build only as of now active modules of the compiler, we provide a preset of modules
+to build with below command:
+```
+$ ./nnas create-package --prefix $HOME/.local
+```
+
+With this command, `~/.local` folder will contain all files in release.
+If you have added `~/.local/bin` in PATH, then you will now have latest compiler binaries.
+
+### Build for debug and release separately
+
+Build target folder can be customized by `NNCC_WORKSPACE` environment, as we may want to separate
+debug and release builds.
+
+```
+$ NNCC_WORKSPACE=build/debug ./nncc configure
+$ ./nncc build
+```
+will build debug version in `build/debug` folder, and
+
+```
+$ NNCC_WORKSPACE=build/release ./nncc configure -DCMAKE_BUILD_TYPE=Release
+$ ./nncc build
+```
+will build release version in `build/release` folder.
+
+### Trouble shooting
+
+If you are using python3.8, as there is no TensorFlow1.13.2 package for python3.8, build may fail.
+Please install python3.7 or lower versions as default python3.
+
+## Build for Windows
+
+To build for Windows, we use MinGW(Minimalist GNU for Windows). [Here](https://github.com/git-for-windows/build-extra/releases) you can download a tool that includes it.
+
+```
+$ git clone https://github.com/Samsung/ONE.git one
+$ cd one
+$ NNAS_BUILD_PREFIX=build ./nnas create-package --preset 20200731_windows --prefix install
+```
+
+- `NNAS_BUILD_PREFIX` is the path to directory where compiler-build-artifacts will be stored.
+- `--preset` is the one that specifies a version you will install. You can see `infra/packaging/preset/` directory for more details and getting latest version.
+- `--prefix` is the install directory.
diff --git a/docs/howto/how-to-build-package.md b/docs/howto/how-to-build-package.md
index 65cc3c753..803466bfe 100644
--- a/docs/howto/how-to-build-package.md
+++ b/docs/howto/how-to-build-package.md
@@ -1 +1,188 @@
 # How to Build Package
+
+## Overview
+
+This document describes how to build a Package to run the model in our runtime
+_onert_ that consists of model and additional file(s). Users can build a
+package through command line tools.
+
+Steps of building a Package:
+1. Import model and convert to circle
+1. Optionally, optimize and quantize circle
+1. Create package from circle
+
+NOTE: Examples and options of each command shown below are from the version of
+writing this document. They may differ from latest version of commands, 1.9.0.
+Please fire an issue or post a PR to correct them if anything needs update.
+
+## Import model
+
+Currently TensorFlow and TensorFlow lite models are supported as of writing
+this document.
+
+To import a model, use `one-import` with a model framework key and arguments.
+```
+$ one-import FRAMEWORK [arguments]
+```
+
+Execute `one-import` without any key will show the list of supported frameworks.
+
+Example of `one-import` command:
+```
+$ one-import
+Usage: one-import [FRAMEWORK] ...
+Available FRAMEWORK drivers:
+  bcq
+  tf
+  tflite
+```
+
+### Example for TensorFlow
+
+This is an example to import TensorFlow model:
+```
+$ one-import tf --input_path mymodel.pb --output_path mymodel.circle \
+--input_arrays input1,input2 --input_shapes "1,224,224,3:1000" \
+--output_arrays output
+```
+
+Running with `--help` will show current required/optional arguments:
+```
+$ one-import tf --help
+Convert TensorFlow model to circle.
+Usage: one-import-tf
+    --version Show version information and exit
+    --input_path <path/to/tfmodel>
+    --output_path <path/to/circle>
+    --input_arrays <names of the input arrays, comma-separated>
+    --input_shapes <input shapes, colon-separated>
+    --output_arrays <names of the output arrays, comma-separated>
+    --v2 Use TensorFlow 2.x interface (default is 1.x interface)
+```
+
+### Example for TensorFlow lite
+
+This is an example to import TensorFlow lite model:
+```
+$ one-import tflite --input_path mymodel.tflite --output_path mymodel.circle
+```
+
+Likewise, running with `--help` will show current required/optional arguments:
+```
+$ one-import tflite --help
+Convert TensorFlow lite model to circle.
+Usage: one-import-tflite
+    --version Show version information and exit
+    --input_path <path/to/tflitemodel>
+    --output_path <path/to/circle>
+```
+
+### Example for TensorFlow Model Including BCQ Information
+
+This is an example to import TensorFlow model which includes BCQ information.
+As a result of this command, BCQ information nodes will be preserved.
+```
+$ one-import bcq --input_path bcqmodel.pb --output_path bcqmodel.circle
+```
+
+Likewise, running with `--help` will show current required/optional arguments:
+```
+$ one-import bcq --help
+Convert TensorFlow model with BCQ to circle.
+Usage: one-import-bcq
+    --version Show version information and exit
+    --input_path <path/to/tfmodel/with/BCQ>
+    --output_path <path/to/circle>
+    --input_arrays <names of the input arrays, comma-separated>
+    --input_shapes <input shapes, colon-separated>
+    --output_arrays <names of the output arrays, comma-separated>
+    --v2 Use TensorFlow 2.x interface (default is 1.x interface)
+```
+
+## Optimize circle model
+
+circle model can be optimized for better performance and smaller size.
+Typical optimization algorithm for this is to fuse some patterns of operators
+to one fused operator.
+
+This is an example to optimize circle model:
+```
+$ one-optimize --all --input_path mymodel.circle --output_path optmodel.circle
+```
+
+Run with `--help` will show current optimization options:
+```
+$ one-optimize --help
+Optimize circle model.
+Usage: one-optimize
+    --version       Show version information and exit
+    --all           Enable all optimization algorithms
+    --fuse_bcq      Enable FuseBCQ Pass
+    --fuse_instnorm Enable FuseInstanceNormalization Pass
+    --resolve_customop_add
+                    Enable ResolveCustomOpAddPass Pass
+    --resolve_customop_batchmatmul
+                    Enable ResolveCustomOpBatchMatMulPass Pass
+    --resolve_customop_matmul
+                    Enable ResolveCustomOpMatMulPass Pass
+    --input_path <path/to/input/circle>
+    --output_path <path/to/output/circle>
+```
+
+## Quantize circle model
+
+Floating-point circle model can be quantized to lower-precision format
+(e.g., uint8 or int16) for faster inference speed and smaller model size,
+by reducing the number of bits that represent weights and activations.
+
+This is an example to quantize circle model:
+```
+$ one-quantize --input_path mymodel.circle --output_path quantmodel.circle
+```
+
+Like wise, `--help` will show current quantization options:
+```
+$ one-quantize --help
+Quantize circle model.
+Usage: one-quantize
+    --version         Show version information and exit
+    --input_dtype     Input data type (supported: float32, default=float32)
+    --quantized_dtype Output quantized data type (supported: uint8, default=uint8)
+    --granularity     Quantize granularity (supported: layer, channel, default=layer)
+    --min_percentile  Minimum percentile (0.0~100.0, default=1.0)
+    --max_percentile  Maximum percentile (0.0~100.0, default=99.0)
+    --mode            Record mode (supported: percentile/moving_average, default=percentile)
+    --input_path <path/to/input/circle>
+    --input_data <path/to/input/data>
+    --output_path <path/to/output/circle>
+```
+
+## Pack circle model
+
+Use `one-pack` command to create package.
+
+```
+$ one-pack -i mymodel.circle -o nnpackage
+```
+
+`nnpackage` is a folder containing circle model and addition file(s)
+
+```
+$ tree nnpackage
+nnpackage
+└── mymodel
+    ├── metadata
+    │   └── MANIFEST
+    └── mymodel.circle
+```
+
+Likewise, `--help` will show current package options:
+
+```
+$ one-pack --help
+Package circle to nnpkg
+Usage: one-pack
+    -v, --version Show version information and exit
+    -i <path/to/circle>
+    -o <path/to/nnpackage/folder>
+```
diff --git a/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md b/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md
new file mode 100644
index 000000000..fd5f1349f
--- /dev/null
+++ b/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md
@@ -0,0 +1,324 @@
+# How to Build Runtime with GBS for Tizen/RPi4
+
+This document describes how to build runtime with GBS for Tizen AARCH64.
+As a real example, we'll also describe how to prepare Tizen on Raspberry Pi 4
+and show you how to run our test package runner `nnpackage_run`.
+
+For ARM32, there would be not much difference with some changes.
+
+Host PC is Ubuntu 18.04 but other versions or distro may work with a little
+adjustments.
+
+Detailed technical informations are not described here so please read referecnce
+pages while you go on.
+
+
+## Setting up build environment
+
+(1) Add Tizen build tools repo
+```
+$ sudo vim /etc/apt/sources.list
+```
+Add this at the end
+```
+deb [trusted=yes] http://download.tizen.org/tools/latest-release/Ubuntu_18.04/ /
+```
+Note: There's a slash('/`) at the end.
+
+For other versions of Ubuntu, please refer
+http://download.tizen.org/tools/latest-release/ lists.
+
+(2) Update package informations and upgrade to latest
+```
+$ sudo apt-get update
+$ sudo apt-get upgrade
+```
+
+(3) Install GBS tools
+```
+$ sudo apt-get install gbs mic
+```
+
+To get more informations, please refer [HERE](https://source.tizen.org/ko/documentation/developer-guide/getting-started-guide/installing-development-tools)
+
+## Build ONERT
+
+(1) Set `python2` as default python
+
+Some tools of GBS run in `python2` and won't run with `python3`.
+Please check `python` version and set it to 2.x.
+
+(2) set `TIZEN_BUILD_ROOT`
+
+You may set `GBS-ROOT` to any place you like. Ususally we use home folder.
+```
+$ export TIZEN_BUILD_ROOT=$HOME/GBS-ROOT/
+```
+Adding to `$HOME/.profile` file would be a good thing.
+
+(3) clone ONE repo
+
+```
+git clone https://github.com/Samsung/ONE.git
+```
+
+(4) Build
+
+```
+$ cd ONE
+
+$ gbs -c infra/nnfw/config/gbs.conf build --include-all -A aarch64 --define 'test_build 1'
+```
+- `-A aarch64` is to set architecture to AARCH64. Use `arm32` for ARM32 target.
+- `--define 'test_build 1'` is to enable test build so that we can use `nnpackage_run`
+
+Now take a cup of coffee.
+
+(5) Build result RPM packages
+
+```
+$ ls ~/GBS-ROOT/local/repos/tizen/aarch64/RPMS
+nnfw-1.10.0-1.aarch64.rpm
+nnfw-debuginfo-1.10.0-1.aarch64.rpm
+nnfw-debugsource-1.10.0-1.aarch64.rpm
+nnfw-devel-1.10.0-1.aarch64.rpm
+nnfw-minimal-app-1.10.0-1.aarch64.rpm
+nnfw-minimal-app-debuginfo-1.10.0-1.aarch64.rpm
+nnfw-plugin-devel-1.10.0-1.aarch64.rpm
+nnfw-test-1.10.0-1.aarch64.rpm
+nnfw-test-debuginfo-1.10.0-1.aarch64.rpm
+```
+
+`-1.10.0-1` may differ as this document was written with under `1.10.0` development.
+
+## Prepare Tizen on Raspberry Pi 4
+
+Please refer https://wiki.tizen.org/Quick_guide_for_RPI4 for detailed descriptions.
+
+(1) Download flashing tool
+```
+$ wget \
+https://git.tizen.org/cgit/platform/kernel/u-boot/plain/scripts/tizen/sd_fusing_rpi3.sh?h=tizen \
+--output-document=sd_fusing_rpi3.sh
+
+$ chmod 755 sd_fusing_rpi3.sh
+```
+
+(2) Prepare Micro-SD memory card.
+
+You first need to find out device name. This document will skip how to find this.
+Suppose it's `/dev/sdj`:
+```
+$ sudo ./sd_fusing_rpi3.sh -d /dev/sdj --format
+```
+You need to change `/dev/sdj` to your configuration.
+
+Partition table may look like this
+```
+Device     Boot    Start      End  Sectors  Size Id Type
+/dev/sdj1  *        8192   139263   131072   64M  e W95 FAT16 (LBA)
+/dev/sdj2         139264  6430719  6291456    3G 83 Linux
+/dev/sdj3        6430720  9183231  2752512  1.3G 83 Linux
+/dev/sdj4        9183232 62521343 53338112 25.4G  5 Extended
+/dev/sdj5        9185280 61958143 52772864 25.2G 83 Linux
+/dev/sdj6       61960192 62025727    65536   32M 83 Linux
+/dev/sdj7       62027776 62044159    16384    8M 83 Linux
+/dev/sdj8       62046208 62111743    65536   32M 83 Linux
+/dev/sdj9       62113792 62130175    16384    8M 83 Linux
+/dev/sdj10      62132224 62263295   131072   64M 83 Linux
+/dev/sdj11      62265344 62521343   256000  125M 83 Linux
+```
+
+(3) Download images
+
+Please visit http://download.tizen.org/snapshots/tizen/unified/latest/images/standard/iot-boot-arm64-rpi4/
+and http://download.tizen.org/snapshots/tizen/unified/latest/images/standard/iot-headed-3parts-aarch64-rpi.
+
+Please visit `iot-boot-armv7l-rpi4` folder for ARM32 images.
+
+Get latest file. As of writing this document, name has `20200908.3`.
+```
+$ wget  http://download.tizen.org/snapshots/tizen/unified/latest/images/standard/iot-boot-arm64-rpi4/tizen-unified_20200908.3_iot-boot-arm64-rpi4.tar.gz
+
+$ wget http://download.tizen.org/snapshots/tizen/unified/latest/images/standard/iot-headed-3parts-aarch64-rpi/tizen-unified_20200908.3_iot-headed-3parts-aarch64-rpi.tar.gz
+```
+
+(4) Flash images to memory card
+
+As like above, suppose memory card is at `/dev/sdj`
+```
+$ sudo ./sd_fusing_rpi3.sh -d /dev/sdj \
+-b tizen-unified_20200908.3_iot-boot-arm64-rpi4.tar.gz \
+tizen-unified_20200908.3_iot-headed-3parts-aarch64-rpi.tar.gz
+```
+You need to change `/dev/sdj` to your configuration and also `tizen-unified_...` file to your
+latest download file name.
+
+(5) Assign IP address for `sdb` connection
+
+Here, we provide a way to connect `sdb` tool through TCP/IP.
+
+Below steps will modify root image and set fixed IP address.
+
+(5-1) Mount image to host
+```
+$ mkdir j2
+$ sudo mount /dev/sdj2 j2
+```
+As like above, please update `/dev/sdj2` to your configuration.
+
+(5-2) Add a new file
+```
+$ vi j2/etc/systemd/system/ip.service
+```
+and set as like:
+```
+[Service]
+Type=simple
+Restart=always
+RestartSec=1
+User=root
+ExecStart=/bin/sh /bin/ip.sh
+
+[Install]
+WantedBy=multi-user.target
+```
+
+(5-3) Add a new file
+```
+$ vi j2/bin/ip.sh
+```
+and set with IP address for your RPi4:
+```
+ifconfig eth0 192.168.x.y netmask 255.255.255.0 up
+```
+where you should update `192.168.x.y` part to your actual IP address.
+
+(5-4) Add a symbolic link
+```
+$ pushd j2/etc/systemd/system/multi-user.target.wants/
+$ sudo ln -s ../../system/ip.service .
+$ popd
+```
+
+(5-5) Now that every thing is ready, unmount and unplug your memory card and plug into
+RPi4, turn on the power.
+```
+$ sync
+$ sudo umount j2
+```
+
+## sdb connect to Tizen/RPi4
+
+You may need to install Tizen Studio to use `sdb` command.
+Please visit https://developer.tizen.org/ if you don't have this.
+
+We assume `sdb` command is in the PATH.
+
+(1) Connect
+
+```
+$ sdb connect 192.168.x.y
+connecting to 192.168.x.y:26101 ...
+connected to 192.168.x.y:26101
+```
+Please update `192.168.x.y` part to your actual IP address.
+
+Check with `devices` command: you should see `rpi3` or alike.
+```
+$ sdb devices
+List of devices attached
+192.168.x.y:26101     device          rpi3
+```
+
+(2) Remount filesystem with R/W
+
+You need to remount file system with Read/Write so that you can install packages.
+```
+$ sdb root on
+$ sdb shell
+```
+Inside your Tizen/RPi4:
+```
+sh-3.2# mount -o rw,remount /
+```
+
+(3) Download dependent packages
+
+In your host, maybe with another terminal, download packages from
+http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/
+
+```
+$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libarmcl-v20.05-17.5.aarch64.rpm
+
+$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libhdf5-101-1.10.1-3.85.aarch64.rpm
+
+$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libhdf5_cpp101-1.10.1-3.85.aarch64.rpm
+```
+
+(4) Copy to device
+```
+$ sdb push libarmcl-v20.05-17.5.aarch64.rpm /opt/usr/home/owner/share/tmp/
+$ sdb push libhdf5-101-1.10.1-3.85.aarch64.rpm /opt/usr/home/owner/share/tmp/
+$ sdb push libhdf5_cpp101-1.10.1-3.85.aarch64.rpm /opt/usr/home/owner/share/tmp/
+```
+And our runtime packages
+```
+$ cd ~/GBS-ROOT/local/repos/tizen/aarch64/RPMS
+$ sdb push nnfw-1.10.0-1.aarch64.rpm /opt/usr/home/owner/share/tmp/
+$ sdb push nnfw-test-1.10.0-1.aarch64.rpm /opt/usr/home/owner/share/tmp/
+```
+
+(5) Install dependent packages
+
+Within Tizen/RPi4 shell
+```
+sh-3.2# cd /opt/usr/home/owner/share/tmp/
+
+sh-3.2# rpm -i libarmcl-v20.05-17.5.aarch64.rpm
+sh-3.2# rpm -i libhdf5-101-1.10.1-3.85.aarch64.rpm
+sh-3.2# rpm -i libhdf5_cpp101-1.10.1-3.85.aarch64.rpm
+```
+There may be message like this but it seems OK:
+```
+/sbin/ldconfig: Cannot lstat /lib64/libhdf5.so.101.0.0: Permission denied
+```
+Continue install
+```
+sh-3.2# rpm -i nnfw-1.10.0-1.aarch64.rpm
+sh-3.2# rpm -i nnfw-test-1.10.0-1.aarch64.rpm
+```
+
+Our `Product` binary folder is installed at `/opt/usr/nnfw-test`.
+```
+sh-3.2# cd /opt/usr/nnfw-test
+sh-3.2# ls -al
+total 16
+drwxr-xr-x  4 root root 4096 Jan  1 09:05 .
+drwxr-xr-x 14 root root 4096 Jan  1 09:05 ..
+drwxr-xr-x  3 root root 4096 Jan  1 09:05 Product
+drwxr-xr-x  3 root root 4096 Jan  1 09:05 infra
+```
+
+(6) Run nnpackage
+
+Refer `how-to-build-package.md` document to produce nnpackage from a model.
+
+Assume `mobilenet_v2_1.4_224` nnpackage is already copied to
+`/opt/usr/home/owner/media/models` folder with `sdb` command.
+
+```
+sh-3.2# BACKENDS="cpu" Product/out/bin/nnpackage_run \
+--nnpackage /opt/usr/home/owner/media/models/mobilenet_v2_1.4_224
+
+Package Filename /opt/usr/home/owner/media/models/mobilenet_v2_1.4_224
+===================================
+MODEL_LOAD   takes 65.403 ms
+PREPARE      takes 158.716 ms
+EXECUTE      takes 373.447 ms
+- MEAN     :  373.447 ms
+- MAX      :  373.447 ms
+- MIN      :  373.447 ms
+- GEOMEAN  :  373.447 ms
+===================================
+```
diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md
index f4751198e..9181a6138 100644
--- a/docs/howto/how-to-build-runtime.md
+++ b/docs/howto/how-to-build-runtime.md
@@ -70,13 +70,13 @@ Unfortunately, the debug build on the x86_64 architecture currently has an error
 
 ```
 $ export BUILD_TYPE=release
-$ make install
+$ make -f Makefile.template install
 ```
 
 Or you can simply do something like this:
 
 ```
-$ BUILD_TYPE=release make install
+$ BUILD_TYPE=release make -f Makefile.template install
 ```
 
 The build method described here is a `native build` in which the build environment and execution environment are same. So, this command creates a runtime binary targeting the current build architecture, probably x86_64, as the execution environment. You can find the build output in the ./Product folder as follows:
@@ -84,9 +84,9 @@ The build method described here is a `native build` in which the build environme
 ```
 $ tree -L 2 ./Product
 ./Product
-├── obj -> /home/sjlee/star/one/Product/x86_64-linux.debug/obj
-├── out -> /home/sjlee/star/one/Product/x86_64-linux.debug/out
-└── x86_64-linux.debug
+├── obj -> /home/sjlee/star/one/Product/x86_64-linux.release/obj
+├── out -> /home/sjlee/star/one/Product/x86_64-linux.release/out
+└── x86_64-linux.release
     ├── BUILD
     ├── CONFIGURE
     ├── INSTALL
@@ -104,56 +104,68 @@ $ tree -L 3 ./Product/out
 │   └── tflite_run
 ├── include
 │   ├── nnfw
+│   │   ├── NeuralNetworks.h
 │   │   ├── NeuralNetworksEx.h
 │   │   ├── NeuralNetworksExtensions.h
-│   │   ├── NeuralNetworks.h
-│   │   ├── nnfw_experimental.h
-│   │   └── nnfw.h
+│   │   ├── nnfw.h
+│   │   └── nnfw_experimental.h
 │   └── onert
 │       ├── backend
 │       ├── compiler
 │       ├── exec
 │       ├── ir
-│       ├── misc
 │       └── util
 ├── lib
 │   ├── libbackend_cpu.so
 │   ├── libcircle_loader.so
 │   ├── libneuralnetworks.so
 │   ├── libnnfw-dev.so
-│   ├── libnnfw_lib_benchmark.so
-│   ├── libnnfw_lib_misc.a
 │   ├── libonert_core.so
 │   └── libtflite_loader.so
-├── tests
+├── test
 │   ├── FillFrom_runner
+│   ├── command
+│   │   ├── nnpkg-test
+│   │   ├── prepare-model
+│   │   ├── unittest
+│   │   └── verify-tflite
+│   ├── list
+│   │   ├── benchmark_nnpkg_model_list.txt
+│   │   ├── frameworktest_list.aarch64.acl_cl.txt
+│   │   ├── frameworktest_list.aarch64.acl_neon.txt
+│   │   ├── frameworktest_list.aarch64.cpu.txt
+│   │   ├── frameworktest_list.armv7l.acl_cl.txt
+│   │   ├── frameworktest_list.armv7l.acl_neon.txt
+│   │   ├── frameworktest_list.armv7l.cpu.txt
+│   │   ├── frameworktest_list.noarch.interp.txt
+│   │   ├── frameworktest_list.x86_64.cpu.txt
+│   │   ├── nnpkg_test_list.armv7l-linux.acl_cl
+│   │   ├── nnpkg_test_list.armv7l-linux.acl_neon
+│   │   ├── nnpkg_test_list.armv7l-linux.cpu
+│   │   ├── nnpkg_test_list.noarch.interp
+│   │   ├── tflite_loader_list.aarch64.txt
+│   │   └── tflite_loader_list.armv7l.txt
+│   ├── models
+│   │   ├── nnfw_api_gtest
+│   │   ├── run_test.sh
+│   │   └── tflite
 │   ├── nnpkgs
 │   │   └── FillFrom
-│   └── scripts
-│       ├── benchmark_nnapi.sh
-│       ├── benchmark_nnpkg.sh
-│       ├── common.sh
-│       ├── framework
-│       ├── list
-│       ├── print_to_json.sh
-│       ├── test-driver.sh
-│       ├── test_framework.sh
-│       ├── test_scheduler_with_profiling.sh
-│       └── unittest.sh
+│   └── onert-test
 ├── unittest
 │   ├── nnapi_gtest
 │   ├── nnapi_gtest.skip
 │   ├── nnapi_gtest.skip.noarch.interp
-│   ├── nnapi_gtest.skip.x86_64-linux.cpu
-│   ├── test_compute
-│   ├── test_onert
-│   ├── test_onert_backend_cpu_common
-│   ├── test_onert_frontend_nnapi
-│   └── tflite_test
+│   └── nnapi_gtest.skip.x86_64-linux.cpu
 └── unittest_standalone
-    └── nnfw_api_gtest
+    ├── nnfw_api_gtest
+    ├── test_compute
+    ├── test_onert
+    ├── test_onert_backend_cpu_common
+    ├── test_onert_frontend_nnapi
+    └── tflite_test
 
-19 directories, 36 files
+20 directories, 47 files
 
 ```
 
@@ -173,25 +185,23 @@ inception_v3.tflite
 The result of running the inception_v3 model using runtime is as follows. Please consider that this is a test that simply checks execution latency without considering the accuracy of the model.
 
 ```
-$ USE_NNAPI=1 LD_LIBRARY_PATH="./Product/out/lib/:$LD_LIBRARY_PATH" ./Product/out
-/bin/tflite_run ./inception_v3.tflite
-nnapi function 'ANeuralNetworksModel_create' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_addOperand' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_setOperandValue' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_addOperation' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_identifyInputsAndOutputs' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_finish' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksCompilation_create' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksCompilation_finish' is loaded from './Product/out/lib/libneuralnetworks.so'
+$ USE_NNAPI=1 ./Product/out/bin/tflite_run ./inception_v3.tflite
+nnapi function 'ANeuralNetworksModel_create' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_addOperand' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_setOperandValue' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_addOperation' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_identifyInputsAndOutputs' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_finish' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksCompilation_create' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksCompilation_finish' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
 input tensor indices = [317,]
-nnapi function 'ANeuralNetworksExecution_create' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksExecution_setInput' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksExecution_setOutput' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksExecution_startCompute' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksEvent_wait' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksEvent_free' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksExecution_free' is loaded from './Product/out/lib/libneuralnetworks.so'
-... run 1 takes 183.895 ms
+nnapi function 'ANeuralNetworksExecution_create' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksExecution_setInput' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksExecution_setOutput' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksExecution_startCompute' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksEvent_wait' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksEvent_free' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksExecution_free' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
 output tensor indices = [316(max:905),]
 ===================================
 MODEL_LOAD   takes 1.108 ms
@@ -202,10 +212,10 @@ EXECUTE      takes 183.895 ms
 - MIN      :  183.895 ms
 - GEOMEAN  :  183.895 ms
 ===================================
-nnapi function 'ANeuralNetworksCompilation_free' is loaded from './Product/out/lib/libneuralnetworks.so'
-nnapi function 'ANeuralNetworksModel_free' is loaded from './Product/out/lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksCompilation_free' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
+nnapi function 'ANeuralNetworksModel_free' is loaded from '/home/sjlee/star/one/Product/x86_64-linux.release/out/bin/../lib/libneuralnetworks.so'
 ```
-Here, `USE_NNAPI=1` means that **ONE** runtime is used for model inference. If omitted, the model will be executed using Tensorflow lite, the basic framework for verification. `LD_LIBRARY_PATH="./Product/out/lib/:$LD_LIBRARY_PATH"` specifies the location of the runtime library to be used for testing. From the previous build result, you can see that it is the path to the directory where `libneuralnetworks.so` and `libonert_core.so` are located.
+Here, `USE_NNAPI=1` means that **ONE** runtime is used for model inference. If omitted, the model will be executed using Tensorflow lite, the basic framework for verification. From the previous build result, you can see that it is the path to the directory where `libneuralnetworks.so` and `libonert_core.so` are located.
 
 If you come here without any problems, you have all of the basic environments for runtime development.
 
diff --git a/docs/howto/how-to-cross-build-runtime-for-arm.md b/docs/howto/how-to-cross-build-runtime-for-arm.md
index c17c465bf..37f550ab3 100644
--- a/docs/howto/how-to-cross-build-runtime-for-arm.md
+++ b/docs/howto/how-to-cross-build-runtime-for-arm.md
@@ -14,7 +14,7 @@ Use `install_rootfs.sh` script to prepare Root File System. You should have `sud
 $ sudo ./tools/cross/install_rootfs.sh arm
 ```
 - supports `arm`(default) and `aarch` architecutre for now
-- supports `xenial`(default) `trusty`, and `bionic` release
+- supports `xenial`(default) `trusty`, `bionic`, and `focal` release
 
 To see the options,
 ```
@@ -73,10 +73,10 @@ $ echo 'export PATH=~/your/path/gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueab
 ```
 
 - On Ubuntu 18.04 LTS, you can install using `apt-get`.
-Choose g++ version whatever you prefer: 6, 7 or 8.
+Choose g++ version whatever you prefer: 6, 7, 8 or 9.
 
 ```
-$ sudo apt-get install g++-{6,7,8}-arm-linux-gnueabihf
+$ sudo apt-get install g++-{6,7,8,9}-arm-linux-gnueabihf
 ```
 
 Make sure you get `libstdc++.so` updated on your target with your new toolchain's corresponding one.
diff --git a/docs/howto/how-to-introduce-a-new-operation-into-runtime.md b/docs/howto/how-to-introduce-a-new-operation-into-runtime.md
index ab449c4be..583ba0eea 100644
--- a/docs/howto/how-to-introduce-a-new-operation-into-runtime.md
+++ b/docs/howto/how-to-introduce-a-new-operation-into-runtime.md
@@ -229,7 +229,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op)
   ir::Shape new_shape =
       shape_inference::inferSelectShape(input_cond_shape, input_true_shape, input_false_shape);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
 }
 ```
 
diff --git a/docs/howto/index.rst b/docs/howto/index.rst
index d04224ed6..c84902a39 100644
--- a/docs/howto/index.rst
+++ b/docs/howto/index.rst
@@ -14,6 +14,7 @@ How To
   ./how-to-build-compiler.md
   ./how-to-build-package.md
   ./how-to-build-runtime.md
+  ./how-to-build-runtime-tizen-gbs-rpi4.md
   ./how-to-build-runtime-using-prebuilt-docker-image.md
   ./how-to-cross-build-runtime-for-arm.md
   ./how-to-cross-build-runtime-for-aarch64.md
diff --git a/docs/release/1.10/release-note-1.10.0.md b/docs/release/1.10/release-note-1.10.0.md
new file mode 100644
index 000000000..e6423be6f
--- /dev/null
+++ b/docs/release/1.10/release-note-1.10.0.md
@@ -0,0 +1,25 @@
+# Release Note 1.10.0
+
+## ONE Compiler
+
+### Compiler supports more operations
+
+- Dequantize,  UnidirectionalSequenceLSTM
+
+### Changes
+
+- New `--fold_dequantize` option in `one-optimize`
+- New `--fuse_add_with_tconv` option in `one-optimize`
+- Support `int16` quantization in `one-quantize`
+- Test scripts are added for basic testing of one-cmds command line tools
+- Bug fixes for one-cmds command line tools
+
+
+## ONE Runtime
+
+### Runtime backend operation support
+  - ACL-CL backend: OneHot
+  - CPU backend: FullyConnected for Float32 16x1 Block Sparsity
+
+### Optimization
+  - Speed up for ReduceSum, StrideSlice and BinaryArithmetic in CPU backend
diff --git a/docs/release/1.9/release-note-1.9.1.md b/docs/release/1.9/release-note-1.9.1.md
new file mode 100644
index 000000000..55290cfa1
--- /dev/null
+++ b/docs/release/1.9/release-note-1.9.1.md
@@ -0,0 +1,9 @@
+# Release Note 1.9.1
+
+## ONE Compiler
+
+### Changes
+
+- `tf2nnpkg` now supports to import TensorFlow model which includes BCQ information.
+- Minor change for preserving BCQ information.
+- Fix invalid input arguments and add more error handles for one-cmds
diff --git a/docs/runtime/api.md b/docs/runtime/api.md
index 3ff9ff056..9dacd2868 100644
--- a/docs/runtime/api.md
+++ b/docs/runtime/api.md
@@ -28,8 +28,6 @@ For usage, refer to [Howto : NNFW API](../howto/how-to-use-nnfw-api.md).
 
 ## Backend API
 
-Backend API is defined by One Runtime.
-
-Backend API is about actual computation of operations and memory management for operands. In order to allow different kinds of computation units or computation libraries, One Runtime defines Backend API to support user defined operation kernels and memory manager. It contains a lot of C++ headers which are subject to change.
+Backend API enables anyone to extend the runtime in terms of operation computation and memory management.
 
 For detailed descriptions, refer to [Backend API](../runtime/backend-api.md).
diff --git a/docs/runtime/backend-api.md b/docs/runtime/backend-api.md
index b291badcf..b32690a00 100644
--- a/docs/runtime/backend-api.md
+++ b/docs/runtime/backend-api.md
@@ -1 +1,64 @@
 # Backend API
+
+Backend API is defined by One Runtime. It is about actual computation of operations and memory management for operands. In order to allow different kinds of computation units or libraries, Backend API is exposed to support user defined operation kernels and memory manager. It contains several C++ interface classes which are **subject to change**.
+
+## How backends are loaded
+
+When a backend ID is given to a session, the compiler module tries to load `libbackend_{BACKEND_ID}.so`. If it is successful, the runtime looks up for C API functions in it, and make use of those.
+
+## C and C++ API
+
+### C API
+
+We have 2 C API functions which are used as the entrypoint and the exitpoint. Here are the definitions of those.
+
+```c
+onert::backend::Backend *onert_backend_create();
+void onert_backend_destroy(onert::backend::Backend *backend);
+```
+
+What they do is creating a C++ object and destroying it, respectively. These two functions are the only ones that are dynamically resolved at runtime.
+
+### C++ API
+
+> **NOTE** C++ API is subject to change so it may change in every release
+
+C API above is just an entrypoint and it delegates core stuff to C++ API.
+
+Here are major classes are described below. One must implement these classes(and some more classes) to create a backend.
+
+- `Backend` : Responsible to create a backend context which is a set of backend components
+- `IConfig` : Configurations and miscellaneous stuff
+- `ITensorRegistry` : A set of tensor(`ITensor`) objects that are used by the current backend
+- `ITensorBuilder` : Make tensor object and register it to `ITensorRegistry` and static tensors
+- `IKernelGenerator` : Generates operation kernels
+
+Please refer to each class document for details. You may refer to [Bundle Backends](#bundle-backends) for actual implementation samples.
+
+## Provided Backend Implementations
+
+We provide some backends along with the runtime. There is the special backend `controlflow` which is part of runtime core, and some bundle backends which are baseline backends and samples of backend implementation.
+
+## `controlflow` Backend
+
+`controlflow` is a special backend that is always loaded(statically linked, part of runtime core). It is implemented just like other backends, but there are some things that it does exclusively.
+
+- Has kernels for If, While and Permute operations (Kernels from other backends are never be used)
+- The runtime core directly creates `controlflow`'s tensor objects to accept user-given input and output buffers
+- The runtime core gives the executor context to `controlflow` backend which allows control flow ops can change execution flow properly
+
+## Bundle Backends
+
+Without actual implmentation of backends, we cannot run any models. So we provide 3 bundle backends which support dozens of operations.
+
+### cpu
+
+This backend is written in C++ and all the computation is done with CPU only.
+
+### acl_neon
+
+`acl_neon` is a backend that is an adaptation layer of [ARM ComputeLibrary](https://github.com/ARM-software/ComputeLibrary) NE(NEON) part. So it basically only uses CPU too, but worksonly on ARM.
+
+### acl_cl
+
+`acl_cl` is a backend that is an adaptation layer of [ARM ComputeLibrary](https://github.com/ARM-software/ComputeLibrary) CL(OpenCL) part. OpenCL support(`libOpenCL.so`) is also necessary in the running environment to be able to use this backend. Also, it works only on ARM.
diff --git a/docs/runtime/supported-operations-backend.md b/docs/runtime/supported-operations-backend.md
new file mode 100644
index 000000000..7255bf048
--- /dev/null
+++ b/docs/runtime/supported-operations-backend.md
@@ -0,0 +1,166 @@
+# Supported Operations and backend
+
+As of 2020-10-14
+
+### Raw-data format (float32, int32, boolean, etc)
+
+Operation | CPU | ACL-CL | ACL-NEON
+-- | -- | -- | --
+Abs | O | O | O
+Add | O | O | O
+ArgMax | O | O | O
+ArgMin | O |   |
+AvgPool2D | O | O | O
+BatchMatmul | O |   |
+BatchToSpaceND | O | O | O
+Cast | O | O | O
+Concat | O | O | O
+Conv2D | O | O | O
+Cos | O |   |
+Custom | O |   |
+DepthToSpace |   | O | O
+DepthwiseConv2D | O | O | O
+Div | O | O | O
+EmbeddingLookup |   | O | O
+Equal | O | O | O
+Exp | O | O | O
+ExpandDims | O |   |
+Fill | O |   |
+Floor |   | O | O
+FullyConnected | O | O | O
+Gather | O | O | O
+Greater | O | O | O
+GreaterEqual | O | O | O
+HashtableLookup |   | O | O
+If | O |   |
+InstanceNormalize |   | O | O
+L2Normalization | O | O | O
+L2Pool |   | O | O
+LeakyRelu |   | O | O
+Less | O | O | O
+LessEqual | O | O | O
+LocalResponseNormalize |   | O | O
+Log | O |   |
+LogicalAnd |   | O | O
+LogicalNot | O | O | O
+LogicalOr | O | O | O
+Logistic | O | O | O
+LogSoftmax | O |   |
+LSHProjection |   |   |
+LSTM |   | O | O
+Maximum | O | O | O
+MaxPool2D | O | O | O
+Mean | O | O | O
+Minimum | O | O | O
+Mul | O | O | O
+Neg | O | O | O
+NotEqual | O | O | O
+OneHot | O | O |
+Pack | O | O | O
+Pad | O | O | O
+PadV2 | O | O | O
+Pow | O |   |
+PReLU |   | O | O
+Quantize | O |   |
+Range | O |   |
+Rank | O |   |
+ReduceAny(Any) | O |   |
+ReduceMax(Max) | O | O | O
+ReduceMin(Min) | O | O | O
+ReduceProd | O |   |
+ReduceSum(Sum) | O | O | O
+ReLU | O | O | O
+ReLU6 |   | O | O
+Reshape | O | O | O
+ResizeBilinear | O | O | O
+ReverseV2 | O |   | O
+RNN |   | O | O
+Round | O |   |
+Rsqrt | O | O | O
+Select | O |   |
+SelectV2 | O |   |
+Shape | O |   |
+Sin | O |   |
+Slice | O | O | O
+Softmax | O | O | O
+SpaceToBatchND | O | O | O
+SpaceToDepth | O | O | O
+Split | O | O | O
+SplitV | O |   |
+Sqrt | O | O | O
+SquaredDifference | O | O | O
+Squeeze | O | O | O
+StridedSlice | O | O | O
+Sub | O | O | O
+Svdf |   |   |
+Tanh | O | O | O
+Tile | O |   |
+TopKV2 |   |   | O
+Transpose | O | O | O
+TransposeConv |   | O | O
+Unpack(Unstack) | O | O | O
+While | O |   |
+ZerosLike | O |   |
+
+### Quantization format (uint8 asymmetric)
+
+Operation | CPU | ACL-CL | ACL-NEON
+-- | -- | -- | --
+Add | O | O | O
+ArgMax | O | O | O
+ArgMin | O |   |
+AvgPool2D | O | O | O
+BatchToSpaceND | O | O | O
+Cast | O | O |
+Concat | O | O | O
+Conv2D | O | O | O
+Custom | O |   |
+DepthToSpace |   | O | O
+DepthwiseConv2D | O | O | O
+Dequantize |   | O | O
+EmbeddingLookup |   | O | O
+Equal | O | O | O
+ExpandDims | O |   |
+FullyConnected | O | O | O
+Gather | O | O | O
+Greater | O | O | O
+GreaterEqual | O | O | O
+HashtableLookup |   | O | O
+L2Normalization | O |   |
+Less | O | O | O
+LessEqual | O | O | O
+Logistic | O | O | O
+LogSoftmax | O |   |
+Maximum |   | O | O
+MaxPool2D | O | O | O
+Mean | O | O | O
+Minimum |   | O | O
+Mul | O | O |
+NotEqual | O | O | O
+OneHot |   | O |
+Pack |   | O | O
+Pad | O | O | O
+PadV2 | O | O | O
+PReLU |   | O | O
+ReduceMax(Max) |   | O |
+ReduceMin(Min) |   | O |
+ReduceSum(Sum) | O | O |
+ReLU |   | O | O
+ReLU6 |   | O | O
+Reshape | O | O | O
+ResizeBilinear | O |   | O
+Shape | O |   |
+Slice | O | O | O
+Softmax | O | O | O
+SpaceToBatchND | O | O | O
+SpaceToDepth | O | O | O
+Split | O | O | O
+SplitV | O |   |
+Squeeze | O | O | O
+StridedSlice |   | O | O
+Sub | O | O | O
+Tanh | O | O | O
+Tile | O |   |
+Transpose | O | O | O
+TransposeConv |   | O | O
+Unpack(Unstack) |   | O | O
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake
index adec1f91b..0ffa0cd35 100644
--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake
+++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake
@@ -12,7 +12,7 @@ function(_ARMComputeSource_import)
   ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL})
 
   set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE)
-  set(ARMComputeSource_FOUND ${ARMCOMPUTE_SOURCE_GET} PARENT_SCOPE)
+  set(ARMComputeSource_FOUND TRUE PARENT_SCOPE)
 endfunction(_ARMComputeSource_import)
 
 _ARMComputeSource_import()
diff --git a/infra/cmake/packages/BoostConfig.cmake b/infra/cmake/packages/BoostConfig.cmake
index c4d7d5857..e72f742f3 100644
--- a/infra/cmake/packages/BoostConfig.cmake
+++ b/infra/cmake/packages/BoostConfig.cmake
@@ -25,6 +25,17 @@ function(_Boost_Build Boost_PREFIX)
   list(APPEND Boost_Options --with-system)
   list(APPEND Boost_Options --with-filesystem)
 
+  if(DEFINED EXTERNALS_BUILD_THREADS)
+    set(N ${EXTERNALS_BUILD_THREADS})
+  else(DEFINED EXTERNALS_BUILD_THREADS)
+    include(ProcessorCount)
+    ProcessorCount(N)
+  endif(DEFINED EXTERNALS_BUILD_THREADS)
+
+  if((NOT N EQUAL 0) AND BUILD_EXT_MULTITHREAD)
+    list(APPEND Boost_Options -j${N})
+  endif()
+
   set(JAM_FILENAME ${BoostBuild_DIR}/user-config.jam)
 
   if(ANDROID)
diff --git a/infra/cmake/packages/BoostSourceConfig.cmake b/infra/cmake/packages/BoostSourceConfig.cmake
index 52cda7c7d..2477a4857 100644
--- a/infra/cmake/packages/BoostSourceConfig.cmake
+++ b/infra/cmake/packages/BoostSourceConfig.cmake
@@ -13,7 +13,7 @@ function(_BoostSource_import)
   ExternalSource_Download(BOOST ${BOOST_URL})
 
   set(BoostSource_DIR ${BOOST_SOURCE_DIR} PARENT_SCOPE)
-  set(BoostSource_FOUND ${BOOST_SOURCE_GET} PARENT_SCOPE)
+  set(BoostSource_FOUND TRUE PARENT_SCOPE)
 endfunction(_BoostSource_import)
 
 _BoostSource_import()
diff --git a/infra/cmake/packages/CpuInfoSourceConfig.cmake b/infra/cmake/packages/CpuInfoSourceConfig.cmake
new file mode 100644
index 000000000..60419ad9f
--- /dev/null
+++ b/infra/cmake/packages/CpuInfoSourceConfig.cmake
@@ -0,0 +1,21 @@
+function(_CpuInfoSource_import)
+  if(NOT ${DOWNLOAD_CPUINFO})
+    set(CpuInfoSource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT ${DOWNLOAD_CPUINFO})
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
+  # CPUINFO commit including patch from tflite v2.3
+  envoption(CPUINFO_URL ${EXTERNAL_DOWNLOAD_SERVER}/pytorch/cpuinfo/archive/63b254577ed77a8004a9be6ac707f3dccc4e1fd9.tar.gz)
+  ExternalSource_Download(CPUINFO
+    DIRNAME CPUINFO
+    URL ${CPUINFO_URL})
+
+  set(CpuInfoSource_DIR ${CPUINFO_SOURCE_DIR} PARENT_SCOPE)
+  set(CpuInfoSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_CpuInfoSource_import)
+
+_CpuInfoSource_import()
diff --git a/infra/cmake/packages/FlatBuffersSource-1.11/FlatBuffersSourceConfig.cmake b/infra/cmake/packages/FlatBuffersSource-1.12/FlatBuffersSourceConfig.cmake
index 92efbf97e..9ee2c49f7 100644
--- a/infra/cmake/packages/FlatBuffersSource-1.11/FlatBuffersSourceConfig.cmake
+++ b/infra/cmake/packages/FlatBuffersSource-1.12/FlatBuffersSourceConfig.cmake
@@ -7,11 +7,11 @@ function(_FlatBuffersSource_import)
   nnas_include(ExternalSourceTools)
   nnas_include(OptionTools)
 
-  envoption(FLATBUFFERS_1_11_URL https://github.com/google/flatbuffers/archive/v1.11.0.tar.gz)
+  envoption(FLATBUFFERS_1_12_URL https://github.com/google/flatbuffers/archive/v1.12.0.tar.gz)
   ExternalSource_Download(FLATBUFFERS
-    DIRNAME FLATBUFFERS-1.11
-    CHECKSUM MD5=02c64880acb89dbd57eebacfd67200d8
-    URL ${FLATBUFFERS_1_11_URL}
+    DIRNAME FLATBUFFERS-1.12
+    CHECKSUM MD5=c62ffefb3d4548b127cca14ce047f16c
+    URL ${FLATBUFFERS_1_12_URL}
   )
 
   set(FlatBuffersSource_DIR ${FLATBUFFERS_SOURCE_DIR} PARENT_SCOPE)
diff --git a/infra/cmake/packages/FlatBuffersSource-1.11/FlatBuffersSourceConfigVersion.cmake b/infra/cmake/packages/FlatBuffersSource-1.12/FlatBuffersSourceConfigVersion.cmake
index f008e0528..8cfdbf8e5 100644
--- a/infra/cmake/packages/FlatBuffersSource-1.11/FlatBuffersSourceConfigVersion.cmake
+++ b/infra/cmake/packages/FlatBuffersSource-1.12/FlatBuffersSourceConfigVersion.cmake
@@ -1,4 +1,4 @@
-set(PACKAGE_VERSION "1.11")
+set(PACKAGE_VERSION "1.12")
 set(PACKAGE_VERSION_EXACT FALSE)
 set(PACKAGE_VERSION_COMPATIBLE FALSE)
 set(PACKAGE_VERSION_UNSUITABLE TRUE)
diff --git a/infra/cmake/packages/NEON2SSESourceConfig.cmake b/infra/cmake/packages/NEON2SSESourceConfig.cmake
index 5970ec73e..bd40267a5 100644
--- a/infra/cmake/packages/NEON2SSESourceConfig.cmake
+++ b/infra/cmake/packages/NEON2SSESourceConfig.cmake
@@ -7,12 +7,13 @@ function(_NEON2SSESource_import)
   nnas_include(ExternalSourceTools)
   nnas_include(OptionTools)
 
-  # NOTE TensorFlow 1.12 downloads NEON2SSE from the following URL
   # NOTE TensorFlow 1.13.1 downloads NEON2SSE from the following URL
+  # NOTE TensorFlow 2.1 downloads NEON2SSE from the following URL
   # NOTE TensorFlow 2.2 downloads NEON2SSE from the following URL
-  envoption(NEON2SSE_1_12_URL https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz)
+  # NOTE TensorFlow 2.3 downloads NEON2SSE from the following URL
+  envoption(NEON2SSE_URL https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz)
 
-  ExternalSource_Download(NEON2SSE ${NEON2SSE_1_12_URL})
+  ExternalSource_Download(NEON2SSE ${NEON2SSE_URL})
 
   set(NEON2SSESource_DIR ${NEON2SSE_SOURCE_DIR} PARENT_SCOPE)
   set(NEON2SSESource_FOUND TRUE PARENT_SCOPE)
diff --git a/infra/cmake/packages/NoniusSourceConfig.cmake b/infra/cmake/packages/NoniusSourceConfig.cmake
index 0af23ef0e..17965f1eb 100644
--- a/infra/cmake/packages/NoniusSourceConfig.cmake
+++ b/infra/cmake/packages/NoniusSourceConfig.cmake
@@ -20,7 +20,7 @@ function(_NoniusSource_import)
   endif(BUILD_KBENCHMARK)
 
   set(NoniusSource_DIR ${NONIUS_SOURCE_DIR} PARENT_SCOPE)
-  set(NoniusSource_FOUND ${NONIUS_SOURCE_GET} PARENT_SCOPE)
+  set(NoniusSource_FOUND TRUE PARENT_SCOPE)
 endfunction(_NoniusSource_import)
 
 _NoniusSource_import()
diff --git a/infra/cmake/packages/RuySourceConfig.cmake b/infra/cmake/packages/RuySourceConfig.cmake
new file mode 100644
index 000000000..4faf0bb9f
--- /dev/null
+++ b/infra/cmake/packages/RuySourceConfig.cmake
@@ -0,0 +1,21 @@
+function(_RuySource_import)
+  if(NOT ${DOWNLOAD_RUY})
+    set(RuySource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT ${DOWNLOAD_RUY})
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  # NOTE Downloads ruy source used by tensorflow v2.3.0
+  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
+  envoption(RUY_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/ruy/archive/34ea9f4993955fa1ff4eb58e504421806b7f2e8f.tar.gz)
+  ExternalSource_Download(RUY
+    DIRNAME RUY
+    URL ${RUY_URL})
+
+  set(RuySource_DIR ${RUY_SOURCE_DIR} PARENT_SCOPE)
+  set(RuySource_FOUND TRUE PARENT_SCOPE)
+endfunction(_RuySource_import)
+
+_RuySource_import()
diff --git a/infra/cmake/packages/TensorFlowEigenSource-2.3.0-rc0Config.cmake b/infra/cmake/packages/TensorFlowEigenSource-2.3.0-rc0Config.cmake
deleted file mode 100644
index 207f7b5bd..000000000
--- a/infra/cmake/packages/TensorFlowEigenSource-2.3.0-rc0Config.cmake
+++ /dev/null
@@ -1,23 +0,0 @@
-function(_TensorFlowEigenSource_import)
-  if(NOT DOWNLOAD_EIGEN)
-    set(TensorFlowEigenSource_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT DOWNLOAD_EIGEN)
-
-  nnas_include(ExternalSourceTools)
-  nnas_include(OptionTools)
-
-  # NOTE TensorFlow 2.3.0-rc0 uses the following URL
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://gitlab.com")
-  envoption(TENSORFLOW_2_3_0_EIGEN_URL ${EXTERNAL_DOWNLOAD_SERVER}/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz)
-
-  ExternalSource_Download(EIGEN
-    DIRNAME TENSORFLOW-2.3.0-EIGEN
-    URL ${TENSORFLOW_2_3_0_EIGEN_URL}
-)
-
-  set(TensorFlowEigenSource_DIR ${EIGEN_SOURCE_DIR} PARENT_SCOPE)
-  set(TensorFlowEigenSource_FOUND TRUE PARENT_SCOPE)
-endfunction(_TensorFlowEigenSource_import)
-
-_TensorFlowEigenSource_import()
diff --git a/infra/cmake/packages/TensorFlowLite-1.12/Lite/CMakeLists.txt b/infra/cmake/packages/TensorFlowLite-1.12/Lite/CMakeLists.txt
deleted file mode 100644
index 337d6b24f..000000000
--- a/infra/cmake/packages/TensorFlowLite-1.12/Lite/CMakeLists.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-# NOTE The followings SHOULD be defined before using this CMakeLists.txt
-#
-#  'TensorFlowSource_DIR' variable
-#  'FlatBuffersSource_DIR' variable
-#  'eigen' target
-#  'gemmlowp' target
-#  'neon2sse' target
-#  'farmhash' target
-#  'abseil' target
-#
-message(STATUS "Build TensorFlow Lite from ${TensorFlowSource_DIR}")
-
-set(TensorFlowLiteSource_DIR ${TensorFlowSource_DIR}/tensorflow/contrib/lite)
-
-file(GLOB CORE_SRCS "${TensorFlowLiteSource_DIR}/*.c" "${TensorFlowLiteSource_DIR}/*.cc" "${TensorFlowLiteSource_DIR}/c/*.c" "${TensorFlowLiteSource_DIR}/core/api/*.cc")
-file(GLOB_RECURSE CORE_TESTS "${TensorFlowLiteSource_DIR}/*test*.cc")
-list(REMOVE_ITEM CORE_SRCS ${CORE_TESTS})
-
-file(GLOB_RECURSE KERNEL_SRCS "${TensorFlowLiteSource_DIR}/kernels/*.cc")
-file(GLOB_RECURSE KERNEL_TESTS "${TensorFlowLiteSource_DIR}/kernels/*test*.cc")
-list(REMOVE_ITEM KERNEL_SRCS ${KERNEL_TESTS})
-# Exclude buggy kernel(s) from the build
-#list(REMOVE_ITEM KERNEL_SRCS "${TensorFlowLiteSource_DIR}/kernels/internal/spectrogram.cc")
-
-list(APPEND SRCS ${CORE_SRCS})
-list(APPEND SRCS ${KERNEL_SRCS})
-
-include(CheckCXXCompilerFlag)
-
-CHECK_CXX_COMPILER_FLAG(-Wno-extern-c-compat COMPILER_SUPPORT_EXTERN_C_COMPAT_WARNING)
-
-add_library(tensorflowlite-1.12 ${SRCS})
-set_target_properties(tensorflowlite-1.12 PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(tensorflowlite-1.12 PUBLIC ${TensorFlowSource_DIR})
-target_include_directories(tensorflowlite-1.12 PUBLIC ${FlatBuffersSource_DIR}/include)
-target_compile_options(tensorflowlite-1.12 PUBLIC -Wno-ignored-attributes)
-if(COMPILER_SUPPORT_EXTERN_C_COMPAT_WARNING)
-  target_compile_options(tensorflowlite-1.12 PUBLIC -Wno-extern-c-compat)
-endif(COMPILER_SUPPORT_EXTERN_C_COMPAT_WARNING)
-target_compile_definitions(tensorflowlite-1.12 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK")
-target_link_libraries(tensorflowlite-1.12 eigen-fd6845384b86 gemmlowp neon2sse farmhash abseil dl)
diff --git a/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfig.cmake b/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfig.cmake
deleted file mode 100644
index ff15d8576..000000000
--- a/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfig.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-function(_TensorFlowLite_import)
-  nnas_find_package(TensorFlowSource EXACT 1.12 QUIET)
-
-  if(NOT TensorFlowSource_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT TensorFlowSource_FOUND)
-
-  # TensorFlow 1.12 downloads FlatBuffers from https://github.com/google/flatbuffers/archive/1f5eae5d6a1.tar.gz
-  #
-  # Let's use 1.10 released in 2018.10 (compatible with 1f5eae5d6a1).
-  nnas_find_package(FlatBuffersSource EXACT 1.10 QUIET)
-
-  if(NOT FlatBuffersSource_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT FlatBuffersSource_FOUND)
-
-  nnas_find_package(Farmhash QUIET)
-
-  if(NOT Farmhash_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT Farmhash_FOUND)
-
-  nnas_find_package(Eigen-fd6845384b86 QUIET)
-
-  if(NOT Eigen-fd6845384b86_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT Eigen-fd6845384b86_FOUND)
-
-  nnas_find_package(GEMMLowp QUIET)
-
-  if(NOT GEMMLowp_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT GEMMLowp_FOUND)
-
-  nnas_find_package(NEON2SSE QUIET)
-
-  if(NOT NEON2SSE_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT NEON2SSE_FOUND)
-
-  nnas_find_package(Abseil QUIET)
-
-  if(NOT Abseil_FOUND)
-    set(TensorFlowLite_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT Abseil_FOUND)
-
-  if(NOT TARGET tensorflowlite-1.12)
-    nnas_include(ExternalProjectTools)
-    add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/Lite" tflite-1.12)
-  endif(NOT TARGET tensorflowlite-1.12)
-
-  set(TensorFlowLite_FOUND TRUE PARENT_SCOPE)
-endfunction(_TensorFlowLite_import)
-
-_TensorFlowLite_import()
diff --git a/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfigVersion.cmake b/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfigVersion.cmake
deleted file mode 100644
index 4a57b655b..000000000
--- a/infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfigVersion.cmake
+++ /dev/null
@@ -1,9 +0,0 @@
-set(PACKAGE_VERSION "1.12")
-set(PACKAGE_VERSION_EXACT FALSE)
-set(PACKAGE_VERSION_COMPATIBLE FALSE)
-set(PACKAGE_VERSION_UNSUITABLE TRUE)
-
-if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
-  set(PACKAGE_VERSION_EXACT TRUE)
-  set(PACKAGE_VERSION_UNSUITABLE FALSE)
-endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/command/build-docker-image b/infra/command/build-docker-image
index 7653a0c88..933701b81 100644
--- a/infra/command/build-docker-image
+++ b/infra/command/build-docker-image
@@ -13,7 +13,7 @@ function Usage()
 DOCKER_FILE_RPATH_BASE="infra/docker/Dockerfile"
 DOCKER_BUILD_ARGS=()
 DOCKER_FILE_RPATH=${DOCKER_FILE_RPATH_BASE}
-DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnas}
+DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnfw/nnas}
 
 while [[ $# -gt 0 ]]
 do
diff --git a/infra/command/docker-run b/infra/command/docker-run
index 08610bff2..9a186b3d2 100644
--- a/infra/command/docker-run
+++ b/infra/command/docker-run
@@ -1,10 +1,19 @@
 #!/bin/bash
 
 import "docker.configuration"
+USER_MODE=0
+
+if [[ $1 == '--user' ]]; then
+  DOCKER_RUN_OPTS+=" -u $(stat -c "%u" $NNAS_PROJECT_PATH):$(stat -c "%g" $NNAS_PROJECT_PATH)"
+  USER_MODE=1
+  shift
+fi
 
 docker run ${DOCKER_RUN_OPTS} ${DOCKER_ENV_VARS} ${DOCKER_VOLUMES} ${DOCKER_IMAGE_NAME} "$@"
 EXITCODE=$?
 
-docker_cleanup
+if [ $USER_MODE -eq 0 ]; then
+  docker_cleanup
+fi
 
 exit ${EXITCODE}
diff --git a/infra/config/docker.configuration b/infra/config/docker.configuration
index 08931cd28..e5eb85087 100644
--- a/infra/config/docker.configuration
+++ b/infra/config/docker.configuration
@@ -3,7 +3,7 @@
 # Don't run this script
 [[ "${BASH_SOURCE[0]}" == "${0}" ]] && echo "Please don't execute ${BASH_SOURCE[0]}" && exit 1
 
-DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnas}
+DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnfw/nnas}
 echo "Using docker image ${DOCKER_IMAGE_NAME}"
 
 if [ -z "`docker images ${DOCKER_IMAGE_NAME}`" ]; then
diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804
index cc31bba1f..f85c2351c 100644
--- a/infra/docker/Dockerfile.1804
+++ b/infra/docker/Dockerfile.1804
@@ -1,3 +1,18 @@
+# Copyright 2016-2020 Jing Li
+# Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 FROM ubuntu:18.04
 
 ARG UBUNTU_MIRROR
@@ -37,5 +52,48 @@ RUN wget http://download.tizen.org/sdk/tizenstudio/official/binary/sdb_3.1.4_ubu
 RUN unzip -d tmp sdb.zip && rm sdb.zip
 RUN cp tmp/data/tools/sdb /usr/bin/. && rm -rf tmp
 
+# Install java
+RUN apt-get install -y --no-install-recommends openjdk-8-jdk
+
+# download and install Gradle
+# https://services.gradle.org/distributions/
+ARG GRADLE_VERSION=6.4.1
+ARG GRADLE_DIST=bin
+RUN cd /opt && \
+    wget -q https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-${GRADLE_DIST}.zip && \
+    unzip gradle*.zip && \
+    ls -d */ | sed 's/\/*$//g' | xargs -I{} mv {} gradle && \
+    rm gradle*.zip
+
+# download and install Android SDK
+# https://developer.android.com/studio#command-tools
+ARG ANDROID_SDK_VERSION=6514223
+ENV ANDROID_SDK_ROOT /opt/android-sdk
+RUN mkdir -p ${ANDROID_SDK_ROOT}/cmdline-tools && \
+    wget -q https://dl.google.com/android/repository/commandlinetools-linux-${ANDROID_SDK_VERSION}_latest.zip && \
+    unzip *tools*linux*.zip -d ${ANDROID_SDK_ROOT}/cmdline-tools && \
+    rm *tools*linux*.zip
+
+# accept the license agreements of the SDK components
+RUN mkdir -p ${ANDROID_SDK_ROOT}/licenses
+RUN echo 24333f8a63b6825ea9c5514f83c2829b004d1fee > ${ANDROID_SDK_ROOT}/licenses/android-sdk-license
+RUN echo d56f5187479451eabf01fb78af6dfcb131a6481e >> ${ANDROID_SDK_ROOT}/licenses/android-sdk-license
+
+# Env variable for gradle build
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
+ENV GRADLE_HOME /opt/gradle
+ENV PATH ${PATH}:${GRADLE_HOME}/bin:${ANDROID_SDK_ROOT}/cmdline-tools/tools/bin:${ANDROID_SDK_ROOT}/platform-tools
+ENV ANDROID_HOME ${ANDROID_SDK_ROOT}
+
+# Install NDK
+RUN sdkmanager --install "ndk;20.0.5594570"
+RUN sdkmanager "platform-tools"
+
+# Env for ko encoding build
+ENV LC_ALL "C.UTF-8"
+
+# setup adb server
+EXPOSE 5037
+
 # Clean archives (to reduce image size)
 RUN apt-get clean -y
diff --git a/infra/nncc/config/docker.configuration b/infra/nncc/config/docker.configuration
index 7078585a2..25c89ac60 100644
--- a/infra/nncc/config/docker.configuration
+++ b/infra/nncc/config/docker.configuration
@@ -1,4 +1,4 @@
-DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnas}
+DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnfw/nnas}
 echo "Using docker image ${DOCKER_IMAGE_NAME}"
 
 if [ -z "`docker images ${DOCKER_IMAGE_NAME}`" ]; then
diff --git a/infra/nnfw/cmake/CfgOptionFlags.cmake b/infra/nnfw/cmake/CfgOptionFlags.cmake
index b3d058164..450aa21ab 100644
--- a/infra/nnfw/cmake/CfgOptionFlags.cmake
+++ b/infra/nnfw/cmake/CfgOptionFlags.cmake
@@ -69,12 +69,14 @@ option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" ON)
 option(DOWNLOAD_NONIUS "Download nonius source" ON)
 option(DOWNLOAD_BOOST "Download boost source" OFF)
 option(DOWNLOAD_RUY "Download ruy source" ON)
+option(DOWNLOAD_CPUINFO "Download cpuinfo source" ON)
+option(DOWNLOAD_GTEST "Download Google Test source and build Google Test" ON)
 option(BUILD_BOOST "Build boost source" OFF)
 option(BUILD_TENSORFLOW_LITE "Build TensorFlow Lite from the downloaded source" ON)
 option(BUILD_TENSORFLOW_LITE_2_3_0 "Build TensorFlow Lite 2.3.0 from the downloaded source" OFF)
-option(BUILD_GTEST "Download and build Google Test" ON)
 option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" ON)
 option(BUILD_RUY "Build ruy library from the downloaded source" ON)
+option(BUILD_CPUINFO "Build cpuinfo library from the downloaded source" ON)
 option(PROFILE_RUY "Enable ruy library profiling" OFF)
 
 #
diff --git a/infra/nnfw/cmake/buildtool/config/config_x86_64-tizen.cmake b/infra/nnfw/cmake/buildtool/config/config_x86_64-tizen.cmake
new file mode 100644
index 000000000..0f304ecf3
--- /dev/null
+++ b/infra/nnfw/cmake/buildtool/config/config_x86_64-tizen.cmake
@@ -0,0 +1,17 @@
+#
+# aarch64 tizen compile options
+#
+
+message(STATUS "Building for AARCH64 Tizen")
+
+# Build flag for tizen
+set(CMAKE_C_FLAGS_DEBUG     "-O -g -DDEBUG")
+set(CMAKE_CXX_FLAGS_DEBUG   "-O -g -DDEBUG")
+
+# TODO : add and use option_tizen if something uncommon comes up
+# include linux common
+include("cmake/buildtool/config/config_linux.cmake")
+
+# addition for aarch64-tizen
+set(FLAGS_COMMON ${FLAGS_COMMON}
+    )
diff --git a/infra/nnfw/cmake/options/options_aarch64-android.cmake b/infra/nnfw/cmake/options/options_aarch64-android.cmake
index d720b202a..d8eceef35 100644
--- a/infra/nnfw/cmake/options/options_aarch64-android.cmake
+++ b/infra/nnfw/cmake/options/options_aarch64-android.cmake
@@ -1,6 +1,5 @@
 # aarch64 android cmake options
 #
-option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" OFF)
 # NOTE BUILD_ANDROID_TFLITE(JNI lib) is disabled due to BuiltinOpResolver issue.
 # tensorflow-lite does not build BuiltinOpResolver but JNI lib need it
 # Related Issue : #1403
@@ -13,6 +12,6 @@ option(BUILD_BOOST "Build boost source" ON)
 option(BUILD_RUNTIME_NNAPI_TEST "Build Runtime NN API Generated Test" OFF)
 option(BUILD_NNAPI_TEST "Build nnapi_test" OFF)
 option(BUILD_NNPACKAGE_RUN "Build nnpackge_run" ON)
-option(BUILD_TFLITE_RUN "Build tflite-run" OFF)
+option(BUILD_TFLITE_RUN "Build tflite-run" ON)
 option(BUILD_TFLITE_LOADER_TEST_TOOL "Build tflite loader testing tool" OFF)
 option(BUILD_LOGGING "Build logging runtime" OFF)
diff --git a/infra/nnfw/cmake/options/options_x86_64-tizen.cmake b/infra/nnfw/cmake/options/options_x86_64-tizen.cmake
new file mode 100644
index 000000000..bf8b2809e
--- /dev/null
+++ b/infra/nnfw/cmake/options/options_x86_64-tizen.cmake
@@ -0,0 +1,10 @@
+#
+# x86_64 linux cmake options
+#
+option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" OFF)
+option(BUILD_TENSORFLOW_LITE "Build TensorFlow Lite from the downloaded source" OFF)
+option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" OFF)
+
+option(BUILD_LOGGING "Build logging runtime" OFF)
+option(GENERATE_RUNTIME_NNAPI_TESTS "Generate NNAPI operation gtest" OFF)
+option(ENVVAR_ONERT_CONFIG "Use environment variable for onert configuration" OFF)
diff --git a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
index 67f6ef8a2..1b5a32ef6 100644
--- a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
+++ b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
@@ -1,7 +1,7 @@
 function(_ARMCompute_Import)
   include(FindPackageHandleStandardArgs)
 
-  list(APPEND ARMCompute_LIB_SEARCH_PATHS ${ARMCompute_PREFIX})
+  list(APPEND ARMCompute_LIB_SEARCH_PATHS ${ARMCompute_PREFIX}/lib)
 
   find_path(INCLUDE_DIR NAMES arm_compute/core/ITensor.h PATHS ${ARMCompute_INCLUDE_SEARCH_PATHS})
 
@@ -62,34 +62,19 @@ function(_ARMCompute_Import)
   set(ARMCompute_FOUND TRUE PARENT_SCOPE)
 endfunction(_ARMCompute_Import)
 
-### Check whether library exists
-function(_ARMCompute_Check VAR LIBDIR)
-  set(FOUND TRUE)
-
-  if(NOT EXISTS "${LIBDIR}/libarm_compute_core.so")
-    set(FOUND FALSE)
-  endif()
-
-  if(NOT EXISTS "${LIBDIR}/libarm_compute.so")
-    set(FOUND FALSE)
-  endif()
-
-  if(NOT EXISTS "${LIBDIR}/libarm_compute_graph.so")
-    set(FOUND FALSE)
-  endif()
-
-  set(${VAR} ${FOUND} PARENT_SCOPE)
-endfunction(_ARMCompute_Check)
-
 # Let's build and install ARMCompute libraries
-# NOTE This function silently returns on error
-function(_ARMCompute_Build ARMCompute_INSTALL_PREFIX)
-  ### Check whether library exists
-  _ARMCompute_Check(ARMCompute_FOUND ${ARMCompute_INSTALL_PREFIX})
-
-  if(ARMCompute_FOUND)
-    return()
-  endif(ARMCompute_FOUND)
+function(_ARMCompute_Build ARMComputeInstall_DIR)
+  set(PKG_NAME "ARMCOMPUTE")
+  set(PKG_IDENTIFIER "20.05")
+  set(INSTALL_STAMP_PATH "${ARMComputeInstall_DIR}/${PKG_NAME}.stamp")
+  set(ARMComputeBuild_DIR "${CMAKE_BINARY_DIR}/externals/armcompute")
+
+  if(EXISTS ${INSTALL_STAMP_PATH})
+    file(READ ${INSTALL_STAMP_PATH} READ_IDENTIFIER)
+    if("${READ_IDENTIFIER}" STREQUAL "${PKG_IDENTIFIER}")
+      return()
+    endif("${READ_IDENTIFIER}" STREQUAL "${PKG_IDENTIFIER}")
+  endif(EXISTS ${INSTALL_STAMP_PATH})
 
   ### Let's build with SCONS
   nnas_find_package(ARMComputeSource QUIET)
@@ -112,6 +97,9 @@ function(_ARMCompute_Build ARMCompute_INSTALL_PREFIX)
   endif(CMAKE_BUILD_TYPE)
 
   #### Architecture-specific configurations
+
+  #### BUILD_DIR is in source tree to reduce CI build overhead
+  #### TODO Change BUILD_DIR to ${ARMComputeBuild_DIR}
   if(TARGET_ARCH STREQUAL "armv7l")
     set(BUILD_ARCH "armv7a")
     set(BUILD_DIR "${BUILD_ARCH}-${TARGET_OS}.${SCON_BUILD_TYPE}")
@@ -137,12 +125,12 @@ function(_ARMCompute_Build ARMCompute_INSTALL_PREFIX)
   list(APPEND SCONS_OPTIONS "Werror=0")
   list(APPEND SCONS_OPTIONS "os=${TARGET_OS}")
 
-  if(DEFINED ACL_BUILD_THREADS)
-    set(N ${ACL_BUILD_THREADS})
-  else(DEFINED ACL_BUILD_THREADS)
+  if(DEFINED EXTERNALS_BUILD_THREADS)
+    set(N ${EXTERNALS_BUILD_THREADS})
+  else(DEFINED EXTERNALS_BUILD_THREADS)
     include(ProcessorCount)
     ProcessorCount(N)
-  endif(DEFINED ACL_BUILD_THREADS)
+  endif(DEFINED EXTERNALS_BUILD_THREADS)
 
   if((NOT N EQUAL 0) AND BUILD_EXT_MULTITHREAD)
     list(APPEND SCONS_OPTIONS -j${N})
@@ -155,26 +143,34 @@ function(_ARMCompute_Build ARMCompute_INSTALL_PREFIX)
     list(APPEND SCONS_OPTIONS "build_dir=${BUILD_DIR}")
   endif(DEFINED BUILD_DIR)
 
+  list(APPEND SCONS_OPTIONS "install_dir=${ARMComputeInstall_DIR}")
+
+  set(SCONS_CC "gcc")
+  set(SCONS_CXX "g++")
+  if(ANDROID)
+    list(APPEND SCONS_OPTIONS "toolchain_prefix=${ANDROID_TOOLCHAIN_PREFIX}")
+    list(APPEND SCONS_OPTIONS "compiler_prefix=${ANDROID_TOOLCHAIN_ROOT}/bin/aarch64-linux-android${ANDROID_API_LEVEL}-")
+    set(SCONS_CC "clang")
+    set(SCONS_CXX "clang++")
+  endif(ANDROID)
+
   message(STATUS "Build ARMCompute with ${SCONS_PATH} ('${SCONS_OPTIONS}'")
 
   # Build ARMCompute libraries with SCONS
-  # NOTE ARMCompute SConstruct unconditioanlly appends "arm-linux-gnueabihf-" prefix for linux
-  execute_process(COMMAND /usr/bin/env CC=gcc CXX=g++ "${SCONS_PATH}" ${SCONS_OPTIONS}
+  # NOTE ARMCompute build process don't allow logging by using OUTPUT_FILE and ERROR_FILE option
+  execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${ARMComputeInstall_DIR}")
+  execute_process(COMMAND /usr/bin/env CC=${SCONS_CC} CXX=${SCONS_CXX} "${SCONS_PATH}" ${SCONS_OPTIONS}
                   WORKING_DIRECTORY ${ARMComputeSource_DIR}
-                  RESULT_VARIABLE ARMCompute_BUILD)
+                  RESULT_VARIABLE BUILD_EXITCODE)
 
-  # Install ARMCompute libraries to overlay
-  execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${ARMCompute_INSTALL_PREFIX}"
-                  WORKING_DIRECTORY ${ARMComputeSource_DIR}
-                  RESULT_VARIABLE ARMCompute_BUILD)
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy "build/${BUILD_DIR}/libarm_compute_core.so" "${ARMCompute_INSTALL_PREFIX}"
-                  COMMAND ${CMAKE_COMMAND} -E copy "build/${BUILD_DIR}/libarm_compute.so" "${ARMCompute_INSTALL_PREFIX}"
-                  COMMAND ${CMAKE_COMMAND} -E copy "build/${BUILD_DIR}/libarm_compute_graph.so" "${ARMCompute_INSTALL_PREFIX}"
-                  WORKING_DIRECTORY ${ARMComputeSource_DIR}
-                  RESULT_VARIABLE ARMCompute_BUILD)
+  if(NOT BUILD_EXITCODE EQUAL 0)
+    message(FATAL_ERROR "${PKG_NAME} Package: Build and install failed (check '${BUILD_LOG_PATH}' for details)")
+  endif(NOT BUILD_EXITCODE EQUAL 0)
+
+  file(WRITE "${INSTALL_STAMP_PATH}" "${PKG_IDENTIFIER}")
 endfunction(_ARMCompute_Build)
 
-set(ARMCompute_PREFIX ${EXT_OVERLAY_DIR}/lib)
+set(ARMCompute_PREFIX ${EXT_OVERLAY_DIR})
 if(BUILD_ARMCOMPUTE)
   _ARMCompute_Build("${ARMCompute_PREFIX}")
 endif(BUILD_ARMCOMPUTE)
diff --git a/infra/nnfw/cmake/packages/BoostConfig.cmake b/infra/nnfw/cmake/packages/BoostConfig.cmake
index 4f60e9107..f2759f8e1 100644
--- a/infra/nnfw/cmake/packages/BoostConfig.cmake
+++ b/infra/nnfw/cmake/packages/BoostConfig.cmake
@@ -16,6 +16,18 @@ function(_Boost_Build Boost_PREFIX)
   set(BoostBuild_DIR ${CMAKE_BINARY_DIR}/externals/boost)
   set(BoostInstall_DIR ${Boost_PREFIX})
 
+  set(INSTALL_STAMP_PATH "${BoostInstall_DIR}/BOOST.stamp")
+  set(BUILD_LOG_PATH "${BoostBuild_DIR}/BOOST.log")
+  set(PKG_NAME "BOOST")
+  set(PKG_IDENTIFIER "1.58.0")
+
+  if(EXISTS ${INSTALL_STAMP_PATH})
+    file(READ ${INSTALL_STAMP_PATH} READ_IDENTIFIER)
+    if("${READ_IDENTIFIER}" STREQUAL "${PKG_IDENTIFIER}")
+      return()
+    endif("${READ_IDENTIFIER}" STREQUAL "${PKG_IDENTIFIER}")
+  endif(EXISTS ${INSTALL_STAMP_PATH})
+
   unset(Boost_Options)
 
   list(APPEND Boost_Options --build-dir=${BoostBuild_DIR})
@@ -25,6 +37,17 @@ function(_Boost_Build Boost_PREFIX)
   list(APPEND Boost_Options --with-system)
   list(APPEND Boost_Options --with-filesystem)
 
+  if(DEFINED EXTERNALS_BUILD_THREADS)
+    set(N ${EXTERNALS_BUILD_THREADS})
+  else(DEFINED EXTERNALS_BUILD_THREADS)
+    include(ProcessorCount)
+    ProcessorCount(N)
+  endif(DEFINED EXTERNALS_BUILD_THREADS)
+
+  if((NOT N EQUAL 0) AND BUILD_EXT_MULTITHREAD)
+    list(APPEND Boost_Options -j${N})
+  endif()
+
   set(JAM_FILENAME ${BoostBuild_DIR}/user-config.jam)
 
   if(ANDROID)
@@ -41,7 +64,15 @@ function(_Boost_Build Boost_PREFIX)
   # Install Boost libraries
   execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${BoostInstall_DIR}")
   execute_process(COMMAND /usr/bin/env BOOST_BUILD_PATH="${BoostBuild_DIR}" ${BoostSource_DIR}/b2 install ${Boost_Options}
-                  WORKING_DIRECTORY ${BoostSource_DIR})
+                  WORKING_DIRECTORY ${BoostSource_DIR}
+                  OUTPUT_FILE ${BUILD_LOG_PATH}
+                  RESULT_VARIABLE BUILD_EXITCODE)
+
+  if(NOT BUILD_EXITCODE EQUAL 0)
+    message(FATAL_ERROR "${PKG_NAME} Package: Build and install failed (check '${BUILD_LOG_PATH}' for details)")
+  endif(NOT BUILD_EXITCODE EQUAL 0)
+
+  file(WRITE "${INSTALL_STAMP_PATH}" "${PKG_IDENTIFIER}")
 
 endfunction(_Boost_Build)
 
diff --git a/infra/nnfw/cmake/packages/CpuInfoConfig.cmake b/infra/nnfw/cmake/packages/CpuInfoConfig.cmake
new file mode 100644
index 000000000..408cf8510
--- /dev/null
+++ b/infra/nnfw/cmake/packages/CpuInfoConfig.cmake
@@ -0,0 +1,31 @@
+function(_CpuInfo_Build)
+  nnas_find_package(CpuInfoSource QUIET)
+
+  # NOTE This line prevents multiple definitions of cpuinfo target
+  if(TARGET cpuinfo)
+    set(CpuInfoSource_DIR ${CpuInfoSource_DIR} PARENT_SCOPE)
+    set(CpuInfo_FOUND TRUE PARENT_SCOPE)
+    return()
+  endif(TARGET cpuinfo)
+
+  if(NOT CpuInfoSource_FOUND)
+    message(STATUS "CPUINFO: Source not found")
+    set(CpuInfo_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT CpuInfoSource_FOUND)
+
+  set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL "Build command-line tools")
+  set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "Build cpuinfo unit tests")
+  set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "Build cpuinfo mock tests")
+  set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "Build cpuinfo micro-benchmarks")
+  add_extdirectory("${CpuInfoSource_DIR}" cpuinfo EXCLUDE_FROM_ALL)
+  set_target_properties(cpuinfo PROPERTIES POSITION_INDEPENDENT_CODE ON)
+  set(CpuInfoSource_DIR ${CpuInfoSource_DIR} PARENT_SCOPE)
+  set(CpuInfo_FOUND TRUE PARENT_SCOPE)
+endfunction(_CpuInfo_Build)
+
+if(BUILD_CPUINFO)
+  _CpuInfo_Build()
+else(BUILD_CPUINFO)
+  set(CpuInfo_FOUND FALSE)
+endif(BUILD_CPUINFO)
diff --git a/infra/nnfw/cmake/packages/FarmhashSourceConfig.cmake b/infra/nnfw/cmake/packages/FarmhashSourceConfig.cmake
deleted file mode 100644
index ab53f97b2..000000000
--- a/infra/nnfw/cmake/packages/FarmhashSourceConfig.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-function(_FarmhashSource_import)
-  if(NOT ${DOWNLOAD_FARMHASH})
-    set(FarmhashSource_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT ${DOWNLOAD_FARMHASH})
-
-  nnas_include(ExternalSourceTools)
-  nnas_include(OptionTools)
-
-  # NOTE TensorFlow 1.12 downloads farmhash from the following URL
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(FARMHASH_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz)
-  ExternalSource_Download("farmhash" ${FARMHASH_URL})
-
-  set(FarmhashSource_DIR ${farmhash_SOURCE_DIR} PARENT_SCOPE)
-  set(FarmhashSource_FOUND ${farmhash_SOURCE_GET} PARENT_SCOPE)
-endfunction(_FarmhashSource_import)
-
-_FarmhashSource_import()
diff --git a/infra/nnfw/cmake/packages/FlatBuffersConfig.cmake b/infra/nnfw/cmake/packages/FlatBuffersConfig.cmake
index 13ad1113a..d27ac1435 100644
--- a/infra/nnfw/cmake/packages/FlatBuffersConfig.cmake
+++ b/infra/nnfw/cmake/packages/FlatBuffersConfig.cmake
@@ -6,8 +6,8 @@ function(_FlatBuffers_import)
     return()
   endif(Flatbuffers_FOUND)
 
-  # NOTE Tizen uses 1.11
-  nnas_find_package(FlatBuffersSource EXACT 1.11 QUIET)
+  # NOTE Tizen uses 1.12
+  nnas_find_package(FlatBuffersSource EXACT 1.12 QUIET)
 
   if(NOT FlatBuffersSource_FOUND)
     set(FlatBuffers_FOUND FALSE PARENT_SCOPE)
diff --git a/infra/nnfw/cmake/packages/GEMMLowpConfig.cmake b/infra/nnfw/cmake/packages/GEMMLowpConfig.cmake
index ddfcc787e..b321961ca 100644
--- a/infra/nnfw/cmake/packages/GEMMLowpConfig.cmake
+++ b/infra/nnfw/cmake/packages/GEMMLowpConfig.cmake
@@ -1,5 +1,5 @@
 function(_GEMMLowp_import)
-  nnfw_find_package(GEMMLowpSource QUIET)
+  nnas_find_package(GEMMLowpSource QUIET)
 
   if(NOT GEMMLowpSource_FOUND)
     set(GEMMLowp_FOUND FALSE PARENT_SCOPE)
diff --git a/infra/nnfw/cmake/packages/GEMMLowpSourceConfig.cmake b/infra/nnfw/cmake/packages/GEMMLowpSourceConfig.cmake
deleted file mode 100644
index 97c8e0597..000000000
--- a/infra/nnfw/cmake/packages/GEMMLowpSourceConfig.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-function(_GEMMLowpSource_import)
-  if(NOT ${DOWNLOAD_GEMMLOWP})
-    set(GEMMLowpSource_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT ${DOWNLOAD_GEMMLOWP})
-
-  nnas_include(ExternalSourceTools)
-  nnas_include(OptionTools)
-
-  # NOTE TensorFlow 1.12 uses the following URL
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(GEMMLOWP_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.tar.gz)
-  ExternalSource_Download("gemmlowp" ${GEMMLOWP_URL})
-
-  set(GEMMLowpSource_DIR ${gemmlowp_SOURCE_DIR} PARENT_SCOPE)
-  set(GEMMLowpSource_FOUND ${gemmlowp_SOURCE_GET} PARENT_SCOPE)
-endfunction(_GEMMLowpSource_import)
-
-_GEMMLowpSource_import()
diff --git a/infra/nnfw/cmake/packages/GTestConfig.cmake b/infra/nnfw/cmake/packages/GTestConfig.cmake
index f3aadf998..54695531e 100644
--- a/infra/nnfw/cmake/packages/GTestConfig.cmake
+++ b/infra/nnfw/cmake/packages/GTestConfig.cmake
@@ -1,23 +1,19 @@
-if(${BUILD_GTEST})
-  nnas_include(ExternalSourceTools)
-  nnas_include(ExternalProjectTools)
-  nnas_include(OptionTools)
+if(${DOWNLOAD_GTEST})
+  nnas_find_package(GTestSource QUIET)
 
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(GTEST_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/googletest/archive/release-1.8.0.tar.gz)
-  ExternalSource_Download("gtest" ${GTEST_URL})
-
-  # gtest_SOURCE_DIR is used in gtest subdirectorty's cmake
-  set(sourcedir_gtest ${gtest_SOURCE_DIR})
-  unset(gtest_SOURCE_DIR)
+  if(NOT GTestSource_FOUND)
+    set(GTest_FOUND FALSE)
+    return()
+  endif(NOT GTestSource_FOUND)
 
   if(NOT TARGET gtest_main)
-    add_extdirectory(${sourcedir_gtest} gtest EXCLUDE_FROM_ALL)
+    nnas_include(ExternalProjectTools)
+    add_extdirectory(${GTestSource_DIR} gtest EXCLUDE_FROM_ALL)
   endif(NOT TARGET gtest_main)
 
   set(GTest_FOUND TRUE)
   return()
-endif(${BUILD_GTEST})
+endif(${DOWNLOAD_GTEST})
 
 ### Find and use pre-installed Google Test
 find_package(GTest)
diff --git a/infra/nnfw/cmake/packages/NEON2SSESourceConfig.cmake b/infra/nnfw/cmake/packages/NEON2SSESourceConfig.cmake
deleted file mode 100644
index 7bae616e7..000000000
--- a/infra/nnfw/cmake/packages/NEON2SSESourceConfig.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-function(_NEON2SSESource_import)
-  if(NOT ${DOWNLOAD_NEON2SSE})
-    set(NEON2SSESource_FOUND FALSE PARENT_SCOPE)
-    return()
-  endif(NOT ${DOWNLOAD_NEON2SSE})
-
-  nnas_include(ExternalSourceTools)
-  nnas_include(OptionTools)
-
-  # NOTE TensorFlow 1.12 downloads NEON2SSE from the following URL
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(NEON2SSE_URL ${EXTERNAL_DOWNLOAD_SERVER}/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz)
-  ExternalSource_Download("neon_2_sse" ${NEON2SSE_URL})
-
-  set(NEON2SSESource_DIR ${neon_2_sse_SOURCE_DIR} PARENT_SCOPE)
-  set(NEON2SSESource_FOUND ${neon_2_sse_SOURCE_GET} PARENT_SCOPE)
-endfunction(_NEON2SSESource_import)
-
-_NEON2SSESource_import()
diff --git a/infra/nnfw/cmake/packages/Ruy/CMakeLists.txt b/infra/nnfw/cmake/packages/Ruy/CMakeLists.txt
index f4d9f8881..9140a17a7 100644
--- a/infra/nnfw/cmake/packages/Ruy/CMakeLists.txt
+++ b/infra/nnfw/cmake/packages/Ruy/CMakeLists.txt
@@ -13,6 +13,8 @@ list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/example.cc")
 list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/example_advanced.cc")
 list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/tune_tool.cc")
 list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/pmu.cc")
+list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/create_trmul_params.cc")
+list(REMOVE_ITEM RUY_SRCS "${RUY_BASE}/prepare_packed_matrices.cc")
 
 list(APPEND RUY_INSTRUMENTATION_SRCS "${RUY_BASE}/profiler/instrumentation.cc")
 
@@ -27,6 +29,10 @@ add_library(ruy STATIC ${RUY_SRCS})
 target_include_directories(ruy SYSTEM PUBLIC ${RUY_INCLUDES})
 target_compile_options(ruy PRIVATE -O3)
 
+target_include_directories(ruy PRIVATE ${CpuInfoSource_DIR})
+target_link_libraries(ruy PRIVATE cpuinfo)
+target_compile_definitions(ruy PRIVATE RUY_HAVE_CPUINFO)
+
 add_library(ruy_instrumentation ${RUY_INSTRUMENTATION_SRCS})
 target_include_directories(ruy_instrumentation SYSTEM PUBLIC ${RUY_INCLUDES})
 target_compile_options(ruy_instrumentation PRIVATE -O3)
diff --git a/infra/nnfw/cmake/packages/RuyConfig.cmake b/infra/nnfw/cmake/packages/RuyConfig.cmake
index 278e33cb3..4e7cc24ac 100644
--- a/infra/nnfw/cmake/packages/RuyConfig.cmake
+++ b/infra/nnfw/cmake/packages/RuyConfig.cmake
@@ -1,22 +1,31 @@
-function(_Ruy_import)
+function(_Ruy_Build)
   # NOTE This line prevents multiple definitions of ruy target
   if(TARGET ruy)
-    set(Ruy_FOUND TRUE)
+    set(Ruy_FOUND TRUE PARENT_SCOPE)
     return()
   endif(TARGET ruy)
 
-  nnfw_find_package(RuySource QUIET)
+  nnas_find_package(RuySource QUIET)
+  nnfw_find_package(CpuInfo QUIET)
 
   if(NOT RuySource_FOUND)
+    message(STATUS "RUY: Source not found")
     set(Ruy_FOUND FALSE PARENT_SCOPE)
     return()
   endif(NOT RuySource_FOUND)
 
-  if(BUILD_RUY)
-    add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/Ruy" ruy)
-  endif(BUILD_RUY)
+  if (NOT CpuInfo_FOUND)
+    message(STATUS "RUY: CPUINFO not found")
+    set(Ruy_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT CpuInfo_FOUND)
 
+  add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/Ruy" ruy)
   set(Ruy_FOUND TRUE PARENT_SCOPE)
-endfunction(_Ruy_import)
+endfunction(_Ruy_Build)
 
-_Ruy_import()
+if(BUILD_RUY)
+  _Ruy_Build()
+else(BUILD_RUY)
+  set(Ruy_FOUND FASLE)
+endif(BUILD_RUY)
diff --git a/infra/nnfw/cmake/packages/RuySourceConfig.cmake b/infra/nnfw/cmake/packages/RuySourceConfig.cmake
deleted file mode 100644
index 08170fb4f..000000000
--- a/infra/nnfw/cmake/packages/RuySourceConfig.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-function(_RuySource_import)
-  if(NOT ${DOWNLOAD_RUY})
-    set(RuySource_DIR FALSE PARENT_SCOPE)
-    return()
-  endif(NOT ${DOWNLOAD_RUY})
-
-  nnas_include(ExternalSourceTools)
-  nnas_include(OptionTools)
-
-  # NOTE Downloads source from latest ruy library (2020-04-10)
-  envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(RUY_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/ruy/archive/2e2658f964638ab7aa562d4b48b76007d44e38f0.tar.gz)
-  ExternalSource_Download("ruy" ${RUY_URL})
-
-  set(RuySource_DIR ${ruy_SOURCE_DIR} PARENT_SCOPE)
-  set(RuySource_FOUND ${ruy_SOURCE_GET} PARENT_SCOPE)
-endfunction(_RuySource_import)
-
-_RuySource_import()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
index 4cd7610e6..7912e65f3 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
@@ -17,17 +17,17 @@ if(BUILD_TENSORFLOW_LITE)
   return_unless(AbseilSource_FOUND)
   nnfw_find_package(TensorFlowEigen EXACT 1.13.1 QUIET)
   return_unless(TensorFlowEigen_1_13_1_FOUND)
-  nnfw_find_package(FarmhashSource QUIET)
+  nnas_find_package(FarmhashSource QUIET)
   return_unless(FarmhashSource_FOUND)
   nnfw_find_package(FlatBuffers QUIET)
   return_unless(FlatBuffers_FOUND)
-  nnfw_find_package(GEMMLowpSource QUIET)
+  nnas_find_package(GEMMLowpSource QUIET)
   return_unless(GEMMLowpSource_FOUND)
   nnas_find_package(TensorFlowSource EXACT 1.13.1 QUIET)
   return_unless(TensorFlowSource_FOUND)
 
   # Optional packages
-  nnfw_find_package(NEON2SSESource QUIET)
+  nnas_find_package(NEON2SSESource QUIET)
 
   nnas_include(ExternalProjectTools)
   add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite" tflite)
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
index 20547b92d..616f8ff8e 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
@@ -110,9 +110,10 @@ list(APPEND TFLITE_INCLUDES "${TFLiteVanillaRuySource_DIR}")
 
 add_library(tensorflow-lite-2.3.0 STATIC ${TFLITE_SRCS})
 target_include_directories(tensorflow-lite-2.3.0 SYSTEM PUBLIC ${TFLITE_INCLUDES})
-target_compile_definitions(tensorflow-lite-2.3.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV")
+target_include_directories(tensorflow-lite-2.3.0 PRIVATE ${CpuInfoSource_DIR})
+target_compile_definitions(tensorflow-lite-2.3.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV -DRUY_HAVE_CPUINFO")
 set_property(TARGET tensorflow-lite-2.3.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(tensorflow-lite-2.3.0 eigen ${LIB_PTHREAD} dl)
+target_link_libraries(tensorflow-lite-2.3.0 eigen ${LIB_PTHREAD} dl cpuinfo)
 if(NOT ANDROID AND ${BUILD_WITH_NNAPI})
   target_link_libraries(tensorflow-lite-2.3.0 rt)
 endif()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
index d00ca96a6..9671dc4af 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
@@ -92,6 +92,13 @@ if(BUILD_TENSORFLOW_LITE_2_3_0)
   endif()
   return_unless(TFLiteVanillaRuySource_FOUND)
 
+  nnfw_find_package(CpuInfo QUIET)
+  if (NOT CpuInfo_FOUND)
+    message(STATUS "TFLiteVanillaRun: CPUINFO not found")
+    set(TensorFlowLite_2_3_0_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT CpuInfo_FOUND)
+
   nnas_include(ExternalProjectTools)
   add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite-2.3.0" tflite-2.3.0)
 
diff --git a/infra/nnfw/command/count-unittest b/infra/nnfw/command/count-unittest
index 7957f36e7..3ce7bbac3 100644
--- a/infra/nnfw/command/count-unittest
+++ b/infra/nnfw/command/count-unittest
@@ -69,6 +69,6 @@ TOTAL_NEG_TCS=$(echo "$TEST_LIST" | grep '^  neg_' | wc -l)
 TOTAL_POS_TCS=$(echo "$TEST_LIST" | grep '^  neg_' -v | wc -l)
 
 # Report stats
-echo "TOTAL NUMBER OF TEST CASES          : $TOTAL_TCS"
-echo "TOTAL NUMBER OF POSTIVE TEST CASES  : $TOTAL_NEG_TCS"
-echo "TOTAL NUMBER OF NEGATIVE TEST CASES : $TOTAL_POS_TCS"
+printf "TOTAL NUMBER OF TEST CASES          : %5d\n" $TOTAL_TCS
+printf "TOTAL NUMBER OF POSTIVE TEST CASES  : %5d\n" $TOTAL_POS_TCS
+printf "TOTAL NUMBER OF NEGATIVE TEST CASES : %5d\n" $TOTAL_NEG_TCS
diff --git a/infra/nnfw/config/docker.configuration b/infra/nnfw/config/docker.configuration
index 962c02c7f..4716949d7 100644
--- a/infra/nnfw/config/docker.configuration
+++ b/infra/nnfw/config/docker.configuration
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnas}
+DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-nnfw/nnas}
 echo "Using docker image ${DOCKER_IMAGE_NAME}"
 
 if [ -z "`docker images ${DOCKER_IMAGE_NAME}`" ]; then
diff --git a/infra/packaging/preset/20200731_windows b/infra/packaging/preset/20200731_windows
index 65d179eaf..d8d782deb 100644
--- a/infra/packaging/preset/20200731_windows
+++ b/infra/packaging/preset/20200731_windows
@@ -23,13 +23,13 @@ function preset_configure()
   REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
   REQUIRED_UNITS+=("record-minmax" "circle-quantizer")
   REQUIRED_UNITS+=("one-cmds")
+  REQUIRED_UNITS+=("bcq-tools")
 
   NPROC=$(cat /proc/cpuinfo | grep -c processor)
 
   # TODO Use "nncc configure" and "nncc build"
   cmake \
     -G "MSYS Makefiles" \
-    -DTF2NNPKG_FOR_WINDOWS=ON \
     -DUSE_PROTOBUF_LEGACY_IMPORT=ON \
     -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
     -DCMAKE_SHARED_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630
index 7846fd388..db7053a7b 100644
--- a/infra/packaging/res/tf2nnpkg.20200630
+++ b/infra/packaging/res/tf2nnpkg.20200630
@@ -92,11 +92,34 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' '
 
 INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
 
+# Generate BCQ information metadata
+# If model has no BCQ information or invalid information, pb file is not changed.
+"${ROOT}/bin/generate_bcq_metadata" \
+--input_path "${GRAPHDEF_FILE}" \
+--output_path "${TMPDIR}/${MODEL_NAME}_withmeta.pb" \
+--output_arrays "${OUTPUT}"
+
+# Generate BCQ information nodes as output_arrays
+# If model has no BCQ information, output_arrays would be empty.
+"${ROOT}/bin/generate_bcq_output_arrays" \
+--input_path "${TMPDIR}/${MODEL_NAME}_withmeta.pb" \
+--metadata_path "${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt" \
+--output_arrays_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt"
+
 # generate tflite file
-python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \
---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
---input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \
---output_arrays ${OUTPUT}
+TF2TFLITE_CONVERT_SCRIPT="python ${ROOT}/bin/tf2tfliteV2.py ${TF_INTERFACE} "
+TF2TFLITE_CONVERT_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_withmeta.pb "
+TF2TFLITE_CONVERT_SCRIPT+="--input_arrays ${INPUT} "
+TF2TFLITE_CONVERT_SCRIPT+="--output_path ${TMPDIR}/${MODEL_NAME}.tflite "
+TF2TFLITE_CONVERT_SCRIPT+="--output_arrays "
+TF2TFLITE_CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt)"
+TF2TFLITE_CONVERT_SCRIPT+="${OUTPUT}"
+TF2TFLITE_CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_output_arrays.txt) "
+if [ ! -z ${INPUT_SHAPES} ]; then
+  TF2TFLITE_CONVERT_SCRIPT+="--input_shapes ${INPUT_SHAPES} "
+fi
+
+${TF2TFLITE_CONVERT_SCRIPT}
 
 # convert .tflite to .circle
 "${ROOT}/bin/tflite2circle" "${TMPDIR}/${MODEL_NAME}.tflite" "${TMPDIR}/${MODEL_NAME}.tmp.circle"
diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh
index 6cdfdf01b..5fd49a46f 100755
--- a/infra/scripts/docker_build_nncc.sh
+++ b/infra/scripts/docker_build_nncc.sh
@@ -54,20 +54,8 @@ pushd $ROOT_PATH > /dev/null
 mkdir -p ${NNCC_INSTALL_PREFIX}
 ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}"
 
-# create python virtual environment
-./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv"
-
-# TODO remove version number of 'pip==20.2.1 setuptools==49.3.0'
-# NOTE adding version is for temporary hotfix of setuptools 50.x.y version
-./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
-  -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
-  install -U pip==20.2.1 setuptools==49.3.0
-./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
-  -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
-  install tensorflow-cpu==2.3.0
-
 mkdir -p ${ARCHIVE_PATH}
-tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} --exclude "bin/venv" ./
-tar -zcf ${ARCHIVE_PATH}/nncc-venv-package.tar.gz -C ${NNCC_INSTALL_PREFIX} bin/venv
+tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} --exclude test ./
+tar -zcf ${ARCHIVE_PATH}/nncc-test-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./test
 
 popd > /dev/null
diff --git a/packaging/cpuinfo.tar.gz b/packaging/cpuinfo.tar.gz
new file mode 100644
index 000000000..ced5debc3
--- /dev/null
+++ b/packaging/cpuinfo.tar.gz
diff --git a/packaging/gemmlowp.tar.gz b/packaging/gemmlowp.tar.gz
index 68339cdb1..198dc1414 100644
--- a/packaging/gemmlowp.tar.gz
+++ b/packaging/gemmlowp.tar.gz
diff --git a/packaging/gtest.tar.gz b/packaging/gtest.tar.gz
index 52cbbcdfe..b8c31918e 100644
--- a/packaging/gtest.tar.gz
+++ b/packaging/gtest.tar.gz
diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
index 1b8c5fb31..ab77e7c3c 100644
--- a/packaging/nnfw.spec
+++ b/packaging/nnfw.spec
@@ -1,6 +1,6 @@
 Name:    nnfw
 Summary: nnfw
-Version: 1.9.0
+Version: 1.10.0
 Release: 1
 Group:   Development
 License: Apache-2.0 and MIT and BSD-2-Clause
@@ -12,6 +12,7 @@ Source1002: gtest.tar.gz
 Source1003: eigen.tar.gz
 Source1004: gemmlowp.tar.gz
 Source1005: ruy.tar.gz
+Source1006: cpuinfo.tar.gz
 Source2001: nnfw.pc.in
 Source2002: nnfw-plugin.pc.in
 
@@ -114,9 +115,10 @@ tar -xf %{SOURCE1002} -C ./externals
 tar -xf %{SOURCE1003} -C ./externals
 tar -xf %{SOURCE1004} -C ./externals
 tar -xf %{SOURCE1005} -C ./externals
+tar -xf %{SOURCE1006} -C ./externals
 
 %build
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 # runtime build
 %{build_env} ./nnfw configure %{build_options} %{extra_option}
 %{build_env} ./nnfw build -j4
@@ -137,7 +139,7 @@ tar -zcf test-suite.tar.gz infra/scripts
 %endif # arm armv7l aarch64
 
 %install
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 
 mkdir -p %{buildroot}%{_libdir}
 mkdir -p %{buildroot}%{_bindir}
@@ -181,14 +183,14 @@ install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/t
 %files
 %manifest %{name}.manifest
 %defattr(-,root,root,-)
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 %{_libdir}/*.so
 %endif
 
 %files devel
 %manifest %{name}.manifest
 %defattr(-,root,root,-)
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 %dir %{_includedir}/nnfw
 %{_includedir}/nnfw/*
 %{_libdir}/pkgconfig/nnfw.pc
@@ -197,13 +199,13 @@ install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/t
 %files plugin-devel
 %manifest %{name}.manifest
 %defattr(-,root,root,-)
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 %dir %{_includedir}/onert
 %{_includedir}/onert/*
 %{_libdir}/pkgconfig/nnfw-plugin.pc
 %endif
 
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 %files minimal-app
 %manifest %{name}.manifest
 %defattr(-,root,root,-)
@@ -214,7 +216,7 @@ install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/t
 %files test
 %manifest %{name}.manifest
 %defattr(-,root,root,-)
-%ifarch arm armv7l aarch64
+%ifarch arm armv7l aarch64 x86_64
 %dir %{test_install_home}
 %{test_install_home}/*
 %endif # arm armv7l aarch64
diff --git a/packaging/ruy.tar.gz b/packaging/ruy.tar.gz
index 98d1a1e33..9ad14fe6c 100644
--- a/packaging/ruy.tar.gz
+++ b/packaging/ruy.tar.gz
diff --git a/res/TensorFlowLiteRecipes/Add_002/test.recipe b/res/TensorFlowLiteRecipes/Add_002/test.recipe
new file mode 100644
index 000000000..12ba8000b
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Add_002/test.recipe
@@ -0,0 +1,32 @@
+operand {
+  name: "ifm1"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+}
+operand {
+  name: "ifm2"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "2" arg: "-3" arg: "-4"
+    arg: "-5" arg: "6" arg: "-7" arg: "8"
+    arg: "4" arg: "-2" arg: "3" arg: "-1"
+  }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+}
+operation {
+  type: "Add"
+  input: "ifm1"
+  input: "ifm2"
+  output: "ofm"
+  add_options {
+    activation: NONE
+  }
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Add_002/test.reverse b/res/TensorFlowLiteRecipes/Add_002/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Add_002/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Concatenation_001/test.recipe b/res/TensorFlowLiteRecipes/Concatenation_001/test.recipe
new file mode 100644
index 000000000..211976c8c
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Concatenation_001/test.recipe
@@ -0,0 +1,32 @@
+operand {
+  name: "ifm1"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 1 }
+}
+operand {
+  name: "ifm2"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 2 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "2" arg: "-3" arg: "-4"
+    arg: "-5" arg: "6" arg: "-7" arg: "8"
+  }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+}
+operation {
+  type: "Concatenation"
+  concatenation_options {
+    axis: 3
+    activation: NONE
+  }
+  input: "ifm1"
+  input: "ifm2"
+  output: "ofm"
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Concatenation_001/test.reverse b/res/TensorFlowLiteRecipes/Concatenation_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Concatenation_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Dequantize_000/test.recipe b/res/TensorFlowLiteRecipes/Dequantize_000/test.recipe
new file mode 100644
index 000000000..bbd3220c9
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Dequantize_000/test.recipe
@@ -0,0 +1,18 @@
+operand {
+  name: "ifm"
+  type: UINT8
+  shape { dim: 4 }
+  quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operation {
+  type: "Dequantize"
+  input: "ifm"
+  output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Dequantize_000/test.reverse b/res/TensorFlowLiteRecipes/Dequantize_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Dequantize_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/FullyConnected_004/test.recipe b/res/TensorFlowLiteRecipes/FullyConnected_004/test.recipe
new file mode 100644
index 000000000..b89eabeeb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FullyConnected_004/test.recipe
@@ -0,0 +1,69 @@
+operand {
+  name: "in"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operand {
+  name: "weight"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "0" arg: "0" arg: "4"
+    arg: "2" arg: "3" arg: "0" arg: "0"
+    arg: "5" arg: "0" arg: "0" arg: "6"
+  }
+  sparsity {
+      traversal_order { dim: 0 dim: 1 dim: 2 dim: 3 }
+      block_map { dim: 0 dim: 1 }
+      dim_metadata {
+          format: DENSE
+          dense_size: 2
+      }
+      dim_metadata {
+          format: SPARSE_CSR
+          array_segments {
+              dim: 0 dim: 2 dim: 3
+              type: UINT8VEC
+          }
+          array_indices {
+              dim: 0 dim: 1 dim: 1
+              type: UINT8VEC
+          }
+      }
+      dim_metadata {
+          format: DENSE
+          dense_size: 2
+      }
+      dim_metadata {
+          format: DENSE
+          dense_size: 2
+      }
+  }
+}
+operand {
+  name: "bias"
+  type: FLOAT32
+  shape { dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "-2" arg: "-3" arg: "4"
+  }
+}
+operand {
+  name: "out"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operation {
+  type: "FullyConnected"
+  fullyconnected_options {
+    activation: NONE
+  }
+  input: "in"
+  input: "weight"
+  input: "bias"
+  output: "out"
+}
+input: "in"
+output: "out"
diff --git a/res/TensorFlowLiteRecipes/FullyConnected_004/test.reverse b/res/TensorFlowLiteRecipes/FullyConnected_004/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FullyConnected_004/test.reverse
diff --git a/res/TensorFlowLiteRecipes/FullyConnected_005/test.recipe b/res/TensorFlowLiteRecipes/FullyConnected_005/test.recipe
new file mode 100644
index 000000000..0aa1dfa77
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FullyConnected_005/test.recipe
@@ -0,0 +1,43 @@
+operand {
+  name: "in"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operand {
+  name: "weight"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "0" arg: "2" arg: "3"
+    arg: "0" arg: "4" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "5" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "6"
+  }
+}
+operand {
+  name: "bias"
+  type: FLOAT32
+  shape { dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "-2" arg: "-3" arg: "4"
+  }
+}
+operand {
+  name: "out"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operation {
+  type: "FullyConnected"
+  fullyconnected_options {
+    activation: NONE
+  }
+  input: "in"
+  input: "weight"
+  input: "bias"
+  output: "out"
+}
+input: "in"
+output: "out"
diff --git a/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.recipe b/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.recipe
new file mode 100644
index 000000000..d960567e8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.recipe
@@ -0,0 +1,21 @@
+operand {
+  name: "ifm"
+  type: UINT8
+  shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+  quant { min: -4.952 max: 4.939 scale: 0.0388 zero_point: 128 }
+}
+operand {
+  name: "ofm"
+  type: UINT8
+  shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+  quant { min: -15.9375 max: 0 scale: 0.0625 zero_point: 255 }
+}
+operation {
+  type: "LogSoftmax"
+  log_softmax_options {
+  }
+  input: "ifm"
+  output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.reverse b/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/LogSoftmax_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Mul_001/test.recipe b/res/TensorFlowLiteRecipes/Mul_001/test.recipe
new file mode 100644
index 000000000..18c19ff19
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Mul_001/test.recipe
@@ -0,0 +1,32 @@
+operand {
+  name: "ifm1"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+}
+operand {
+  name: "ifm2"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "2" arg: "-3" arg: "-4"
+    arg: "-5" arg: "6" arg: "-7" arg: "8"
+    arg: "4" arg: "-2" arg: "3" arg: "-1"
+  }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 2 dim: 2 dim: 3 }
+}
+operation {
+  type: "Mul"
+  input: "ifm1"
+  input: "ifm2"
+  output: "ofm"
+  mul_options {
+    activation: NONE
+  }
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Mul_001/test.reverse b/res/TensorFlowLiteRecipes/Mul_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Mul_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.recipe
new file mode 100644
index 000000000..b3247f24f
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.recipe
@@ -0,0 +1,113 @@
+operand {
+  name: "filter"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 3
+    dim: 3
+    dim: 2
+  }
+  filler {
+    tag: "gaussian"
+    arg: "0.0"
+    arg: "0.1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition_add_param"
+  type: FLOAT32
+  shape {
+    dim: 1
+  }
+  filler {
+    tag: "explicit"
+    arg: "-2.04724"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Hole"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 2
+    dim: 2
+    dim: 2
+  }
+  quant {
+    min: 0
+    max: 255
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "conv2d_transpose"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "input_size"
+  type: INT32
+  shape {
+    dim: 4
+  }
+  filler {
+    tag: "explicit"
+    arg: "1"
+    arg: "4"
+    arg: "4"
+    arg: "1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operation {
+  type: "TransposeConv"
+  input: "input_size"
+  input: "filter"
+  input: "Hole"
+  output: "conv2d_transpose"
+  transpose_conv_options {
+    padding: VALID
+    stride_w: 1
+    stride_h: 1
+  }
+}
+operation {
+  type: "Add"
+  input: "conv2d_transpose"
+  input: "Addition_add_param"
+  output: "Addition"
+  add_options {
+    activation: NONE
+  }
+}
+input: "Hole"
+output: "Addition"
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.rule b/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.rule
new file mode 100644
index 000000000..894d642a3
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_000/test.rule
@@ -0,0 +1,6 @@
+# To check if Add op is fused to Transposed Convolution op
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "TCONV_EXIST"             $(op_count TRANSPOSE_CONV) '=' 1
+RULE    "NO_ADD"                  $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.recipe
new file mode 100644
index 000000000..89a344f0e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.recipe
@@ -0,0 +1,119 @@
+operand {
+  name: "filter"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 3
+    dim: 3
+    dim: 2
+  }
+  filler {
+    tag: "gaussian"
+    arg: "0.0"
+    arg: "0.1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition_add_param"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  filler {
+    tag: "explicit"
+    arg: "1" arg: "2" arg: "3" arg: "4"
+    arg: "-1" arg: "-2" arg: "-3" arg: "-4"
+    arg: "1" arg: "2" arg: "3" arg: "4"
+    arg: "-1" arg: "-2" arg: "-3" arg: "-4"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Hole"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 2
+    dim: 2
+    dim: 2
+  }
+  quant {
+    min: 0
+    max: 255
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "conv2d_transpose"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "input_size"
+  type: INT32
+  shape {
+    dim: 4
+  }
+  filler {
+    tag: "explicit"
+    arg: "1"
+    arg: "4"
+    arg: "4"
+    arg: "1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operation {
+  type: "TransposeConv"
+  input: "input_size"
+  input: "filter"
+  input: "Hole"
+  output: "conv2d_transpose"
+  transpose_conv_options {
+    padding: VALID
+    stride_w: 1
+    stride_h: 1
+  }
+}
+operation {
+  type: "Add"
+  input: "conv2d_transpose"
+  input: "Addition_add_param"
+  output: "Addition"
+  add_options {
+    activation: NONE
+  }
+}
+input: "Hole"
+output: "Addition"
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.rule b/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.rule
new file mode 100644
index 000000000..86afc47f6
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_001/test.rule
@@ -0,0 +1,6 @@
+# To check if Add op is not fused to Transposed Convolution op
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "TCONV_EXIST"             $(op_count TRANSPOSE_CONV) '=' 1
+RULE    "NO_FUSION"               $(op_count ADD) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.recipe
new file mode 100644
index 000000000..cfea30653
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.recipe
@@ -0,0 +1,113 @@
+operand {
+  name: "filter"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 3
+    dim: 3
+    dim: 2
+  }
+  filler {
+    tag: "gaussian"
+    arg: "0.0"
+    arg: "0.1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Addition_add_param"
+  type: FLOAT32
+  shape {
+    dim: 1
+  }
+  filler {
+    tag: "explicit"
+    arg: "-2.04724"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Hole"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 2
+    dim: 2
+    dim: 2
+  }
+  quant {
+    min: 0
+    max: 255
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "conv2d_transpose"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "input_size"
+  type: INT32
+  shape {
+    dim: 4
+  }
+  filler {
+    tag: "explicit"
+    arg: "1"
+    arg: "4"
+    arg: "4"
+    arg: "1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operation {
+  type: "TransposeConv"
+  input: "input_size"
+  input: "filter"
+  input: "Hole"
+  output: "conv2d_transpose"
+  transpose_conv_options {
+    padding: VALID
+    stride_w: 1
+    stride_h: 1
+  }
+}
+operation {
+  type: "Add"
+  input: "Addition_add_param"
+  input: "conv2d_transpose"
+  output: "Addition"
+  add_options {
+    activation: NONE
+  }
+}
+input: "Hole"
+output: "Addition"
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.rule b/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.rule
new file mode 100644
index 000000000..894d642a3
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_Add_002/test.rule
@@ -0,0 +1,6 @@
+# To check if Add op is fused to Transposed Convolution op
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "TCONV_EXIST"             $(op_count TRANSPOSE_CONV) '=' 1
+RULE    "NO_ADD"                  $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.recipe
new file mode 100644
index 000000000..babf5af4e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.recipe
@@ -0,0 +1,149 @@
+operand {
+  name: "Const_transposed"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 3
+    dim: 3
+    dim: 2
+  }
+  filler {
+    tag: "gaussian"
+    arg: "0.0"
+    arg: "0.1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "FusedBatchNormV3"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "FusedBatchNormV3_add_param"
+  type: FLOAT32
+  shape {
+    dim: 1
+  }
+  filler {
+    tag: "explicit"
+    arg: "-2.04724"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "FusedBatchNormV3_mul_0"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "FusedBatchNormV3_mul_0_param"
+  type: FLOAT32
+  shape {
+    dim: 1
+  }
+  filler {
+    tag: "explicit"
+    arg: "2.00834"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "Hole"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 2
+    dim: 2
+    dim: 2
+  }
+  quant {
+    min: 0
+    max: 255
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "conv2d_transpose"
+  type: FLOAT32
+  shape {
+    dim: 1
+    dim: 4
+    dim: 4
+    dim: 1
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operand {
+  name: "conv2d_transpose/input_sizes"
+  type: INT32
+  shape {
+    dim: 4
+  }
+  filler {
+    tag: "explicit"
+    arg: "1"
+    arg: "4"
+    arg: "4"
+    arg: "1"
+  }
+  quant {
+    quantized_dimension: 0
+  }
+}
+operation {
+  type: "TransposeConv"
+  input: "conv2d_transpose/input_sizes"
+  input: "Const_transposed"
+  input: "Hole"
+  output: "conv2d_transpose"
+  transpose_conv_options {
+    padding: VALID
+    stride_w: 1
+    stride_h: 1
+  }
+}
+operation {
+  type: "Mul"
+  input: "conv2d_transpose"
+  input: "FusedBatchNormV3_mul_0_param"
+  output: "FusedBatchNormV3_mul_0"
+  mul_options {
+    activation: NONE
+  }
+}
+operation {
+  type: "Add"
+  input: "FusedBatchNormV3_mul_0"
+  input: "FusedBatchNormV3_add_param"
+  output: "FusedBatchNormV3"
+  add_options {
+    activation: NONE
+  }
+}
+input: "Hole"
+output: "FusedBatchNormV3"
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.rule b/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.rule
new file mode 100644
index 000000000..0988ecf28
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_BN_001/test.rule
@@ -0,0 +1,7 @@
+# To check if BatchNorm op(mul + add) is fused to Transposed Convolution op
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "TCONV_EXIST"             $(op_count TRANSPOSE_CONV) '=' 1
+RULE    "NO_MUL"                  $(op_count MUL) '=' 0
+RULE    "NO_ADD"                  $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/PRelu_001/test.recipe b/res/TensorFlowLiteRecipes/PRelu_001/test.recipe
new file mode 100644
index 000000000..c18acdbbc
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/PRelu_001/test.recipe
@@ -0,0 +1,27 @@
+operand {
+  name: "ifm"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+}
+operand {
+  name: "alpha"
+  type: FLOAT32
+  shape { dim: 1 dim: 1 dim: 3 }
+  filler {
+    tag: "explicit"
+    arg: "0.1" arg: "0.3" arg: "0.5"
+  }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+}
+operation {
+  type: "PRelu"
+  input: "ifm"
+  input: "alpha"
+  output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/PRelu_001/test.reverse b/res/TensorFlowLiteRecipes/PRelu_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/PRelu_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.recipe b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.recipe
new file mode 100644
index 000000000..773d44343
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.recipe
@@ -0,0 +1,185 @@
+operand {
+  name: "ifm"
+  type: FLOAT32
+  shape { dim: 3 dim: 1 dim: 2 }
+}
+operand {
+  name: "input_to_input_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 2 }
+}
+operand {
+  name: "input_to_forget_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 2 }
+}
+operand {
+  name: "input_to_cell_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 2 }
+}
+operand {
+  name: "input_to_output_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 2 }
+}
+operand {
+  name: "recurrent_to_input_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "recurrent_to_forget_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "recurrent_to_cell_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "recurrent_to_output_weights"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "cell_to_input_weights"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "cell_to_forget_weights"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "cell_to_output_weights"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "input_gate_bias"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "forget_gate_bias"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "cell_gate_bias"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "output_gate_bias"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "projection_weight"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "projection_bias"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "activation_state"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operand {
+  name: "cell_state"
+  type: FLOAT32
+  shape { dim: 1 dim: 4 }
+}
+operand {
+  name: "input_layer_norm_coefficients"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "forget_layer_norm_coefficients"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "cell_layer_norm_coefficients"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "output_layer_norm_coefficients"
+  type: FLOAT32
+  shape { dim: 4 }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 3 dim: 1 dim: 4 }
+}
+operation {
+  type: "UnidirectionalSequenceLSTM"
+  unidirectional_sequence_lstm_options {
+    activation: NONE
+    cell_clip: 0.0
+    proj_clip: 0.0
+    time_major: false
+    asymmetric_quantize_inputs: false
+  }
+  input: "ifm"
+  input: "input_to_input_weights"
+  input: "input_to_forget_weights"
+  input: "input_to_cell_weights"
+  input: "input_to_output_weights"
+  input: "recurrent_to_input_weights"
+  input: "recurrent_to_forget_weights"
+  input: "recurrent_to_cell_weights"
+  input: "recurrent_to_output_weights"
+  input: "cell_to_input_weights"
+  input: "cell_to_forget_weights"
+  input: "cell_to_output_weights"
+  input: "input_gate_bias"
+  input: "forget_gate_bias"
+  input: "cell_gate_bias"
+  input: "output_gate_bias"
+  input: "projection_weight"
+  input: "projection_bias"
+  input: "activation_state"
+  input: "cell_state"
+  input: "input_layer_norm_coefficients"
+  input: "forget_layer_norm_coefficients"
+  input: "cell_layer_norm_coefficients"
+  input: "output_layer_norm_coefficients"
+  output: "ofm"
+}
+input: "ifm"
+input: "input_to_input_weights"
+input: "input_to_forget_weights"
+input: "input_to_cell_weights"
+input: "input_to_output_weights"
+input: "recurrent_to_input_weights"
+input: "recurrent_to_forget_weights"
+input: "recurrent_to_cell_weights"
+input: "recurrent_to_output_weights"
+input: "cell_to_input_weights"
+input: "cell_to_forget_weights"
+input: "cell_to_output_weights"
+input: "input_gate_bias"
+input: "forget_gate_bias"
+input: "cell_gate_bias"
+input: "output_gate_bias"
+input: "projection_weight"
+input: "projection_bias"
+input: "activation_state"
+input: "cell_state"
+input: "input_layer_norm_coefficients"
+input: "forget_layer_norm_coefficients"
+input: "cell_layer_norm_coefficients"
+input: "output_layer_norm_coefficients"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.reverse b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.recipe b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.recipe
new file mode 100644
index 000000000..5938cc115
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.recipe
@@ -0,0 +1,323 @@
+operand {
+  name: "ifm"
+  type: FLOAT32
+  shape { dim: 1 dim: 28 dim: 28 }
+}
+operand {
+  name: "input_to_input_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 28 }
+  filler {
+    tag: "explicit"
+    arg: "0.1687648445367813" arg: "0.04799923673272133" arg: "0.195631742477417" arg: "0.10485544055700302" arg: "0.018675213679671288" arg: "0.13739116489887238" arg: "0.0898093432188034" arg: "-0.28823068737983704" arg: "-0.02585843950510025" arg: "0.05994327738881111" arg: "0.07523486018180847" arg: "0.0797467827796936" arg: "0.3736445903778076" arg: "0.6627118587493896" arg: "0.3780449628829956" arg: "0.36176905035972595" arg: "-0.2041059285402298" arg: "0.1464163213968277" arg: "0.4136067032814026" arg: "0.1049080342054367" arg: "0.11873452365398407" arg: "-0.05727154389023781" arg: "-0.04963447153568268" arg: "-0.332282155752182" arg: "0.07995595782995224" arg: "-0.20255199074745178" arg: "-0.05633578822016716" arg: "0.11420387774705887"
+    arg: "-0.029032165184617043" arg: "0.0007604139973409474" arg: "-0.31187641620635986" arg: "0.1813918948173523" arg: "-0.31930115818977356" arg: "0.05086275562644005" arg: "-0.029340023174881935" arg: "0.039944298565387726" arg: "0.002410847693681717" arg: "-0.32042407989501953" arg: "0.22277581691741943" arg: "0.44808948040008545" arg: "0.439309686422348" arg: "0.3975866138935089" arg: "0.30034393072128296" arg: "0.1280607134103775" arg: "-0.06160789355635643" arg: "-0.09105906635522842" arg: "-0.23636692762374878" arg: "-0.3308735489845276" arg: "-0.572564423084259" arg: "-0.6935749053955078" arg: "-0.5299585461616516" arg: "-0.27702340483665466" arg: "-0.7308681011199951" arg: "-0.6580930352210999" arg: "-0.4219287633895874" arg: "0.20912277698516846"
+    arg: "0.060638219118118286" arg: "-0.12734581530094147" arg: "0.14102879166603088" arg: "-0.2901698052883148" arg: "-0.24771066009998322" arg: "-0.3001070022583008" arg: "-0.22395247220993042" arg: "-0.4311608672142029" arg: "-0.17570453882217407" arg: "0.08624797314405441" arg: "0.018354324623942375" arg: "0.5221205949783325" arg: "0.14471562206745148" arg: "0.03486153483390808" arg: "0.16570599377155304" arg: "0.16822484135627747" arg: "0.7482292056083679" arg: "0.7351004481315613" arg: "0.27890756726264954" arg: "0.5524212121963501" arg: "0.23095451295375824" arg: "0.4314809739589691" arg: "0.3363182246685028" arg: "0.6496651768684387" arg: "0.5523284077644348" arg: "0.4055400490760803" arg: "0.4644913971424103" arg: "0.09175155311822891"
+    arg: "0.21605326235294342" arg: "0.08236871659755707" arg: "-0.17103230953216553" arg: "-0.18498405814170837" arg: "-0.5800216794013977" arg: "-0.487117737531662" arg: "-0.7577510476112366" arg: "-0.4389793574810028" arg: "-0.6259695291519165" arg: "-0.6731855273246765" arg: "-0.5440476536750793" arg: "-0.4302021861076355" arg: "-0.28402388095855713" arg: "-0.22358544170856476" arg: "-0.30889278650283813" arg: "0.12872418761253357" arg: "0.03388393670320511" arg: "0.034014951437711716" arg: "0.2464621216058731" arg: "-0.08322205394506454" arg: "0.20342972874641418" arg: "-0.14718593657016754" arg: "0.2358621209859848" arg: "0.2880614697933197" arg: "0.08535374701023102" arg: "-0.09865032881498337" arg: "-0.44678133726119995" arg: "-0.11892740428447723"
+    arg: "0.1754414290189743" arg: "0.24821344017982483" arg: "0.10287455469369888" arg: "0.2701801657676697" arg: "0.41744735836982727" arg: "0.19831198453903198" arg: "0.46229296922683716" arg: "0.16203390061855316" arg: "0.12368197739124298" arg: "-0.27287790179252625" arg: "0.014193423092365265" arg: "0.11754778772592545" arg: "-0.05268847197294235" arg: "0.017977338284254074" arg: "0.381906121969223" arg: "0.17318789660930634" arg: "0.1804313063621521" arg: "-0.021003693342208862" arg: "0.5185583829879761" arg: "0.5880140662193298" arg: "0.4118947982788086" arg: "0.0662691667675972" arg: "0.08239153027534485" arg: "0.04901377111673355" arg: "0.12195281684398651" arg: "0.46632856130599976" arg: "0.18819667398929596" arg: "0.18338985741138458"
+    arg: "0.1751364767551422" arg: "0.04932933673262596" arg: "0.10589705407619476" arg: "0.02743552438914776" arg: "0.15879607200622559" arg: "0.3464271128177643" arg: "-0.08866819739341736" arg: "0.10017956793308258" arg: "0.14074550569057465" arg: "0.07490764558315277" arg: "0.34844619035720825" arg: "0.27873194217681885" arg: "0.4934106469154358" arg: "0.6267357468605042" arg: "0.653296709060669" arg: "0.6742697358131409" arg: "0.09091877192258835" arg: "0.08090687543153763" arg: "0.007206875365227461" arg: "0.06743039935827255" arg: "-0.12605983018875122" arg: "0.15661095082759857" arg: "0.24821621179580688" arg: "0.12353820353746414" arg: "0.1812787503004074" arg: "-0.32736217975616455" arg: "-0.0818324014544487" arg: "-0.15149752795696259"
+    arg: "-0.24534951150417328" arg: "0.3147708773612976" arg: "-0.008836879394948483" arg: "-0.1994941234588623" arg: "-0.049111880362033844" arg: "-0.20707322657108307" arg: "-0.1682814508676529" arg: "-0.010347025468945503" arg: "-0.5228656530380249" arg: "-0.5135005712509155" arg: "-0.3480781614780426" arg: "-0.38772332668304443" arg: "-0.4124152958393097" arg: "-0.39354074001312256" arg: "-0.4979104995727539" arg: "-0.19147707521915436" arg: "0.0627184733748436" arg: "0.46287989616394043" arg: "0.07998469471931458" arg: "0.0790494829416275" arg: "0.09622958302497864" arg: "0.014114361256361008" arg: "-0.1053328886628151" arg: "0.03503574803471565" arg: "0.16327714920043945" arg: "0.4359706938266754" arg: "0.05958576500415802" arg: "-0.6027227640151978"
+    arg: "0.057672709226608276" arg: "-0.036423951387405396" arg: "-0.10990197956562042" arg: "0.0010471289278939366" arg: "0.16711515188217163" arg: "0.32950177788734436" arg: "0.15729546546936035" arg: "0.10592831671237946" arg: "-0.3053370416164398" arg: "-0.06242264807224274" arg: "-0.005419928580522537" arg: "0.2704862654209137" arg: "0.4702851474285126" arg: "-0.07878115773200989" arg: "-0.23939359188079834" arg: "-0.046783994883298874" arg: "0.2417677342891693" arg: "0.07949794828891754" arg: "-0.01079419907182455" arg: "-0.20259179174900055" arg: "-0.3341178894042969" arg: "-0.06668252497911453" arg: "-0.4118092954158783" arg: "-0.25006234645843506" arg: "-0.2406432181596756" arg: "-0.2872427701950073" arg: "-0.33487430214881897" arg: "0.2951969802379608"
+    arg: "0.25783705711364746" arg: "0.03347025439143181" arg: "0.03592922165989876" arg: "-0.14409473538398743" arg: "-0.2313976287841797" arg: "-0.05575039982795715" arg: "-0.3067474663257599" arg: "-0.3740054965019226" arg: "-0.173713818192482" arg: "-0.5401638746261597" arg: "-0.26812097430229187" arg: "-0.2669502794742584" arg: "0.0020368772093206644" arg: "0.5259052515029907" arg: "0.12416208535432816" arg: "0.295955628156662" arg: "0.17083768546581268" arg: "0.3295002579689026" arg: "0.1629410982131958" arg: "0.6651543378829956" arg: "0.3124505877494812" arg: "0.20944754779338837" arg: "0.22139877080917358" arg: "0.09963700920343399" arg: "0.14622727036476135" arg: "0.24759505689144135" arg: "0.29243841767311096" arg: "0.3123530149459839"
+    arg: "-0.14453744888305664" arg: "-0.22069278359413147" arg: "0.2322058528661728" arg: "0.03341517224907875" arg: "0.1755218654870987" arg: "-0.00033268495462834835" arg: "-0.17320746183395386" arg: "-0.013570177368819714" arg: "-0.20040804147720337" arg: "-0.22750437259674072" arg: "-0.22218504548072815" arg: "-0.20970220863819122" arg: "-0.1999887377023697" arg: "-0.18349596858024597" arg: "-0.12978042662143707" arg: "0.34552574157714844" arg: "0.10098802298307419" arg: "-0.1387794464826584" arg: "0.01755024679005146" arg: "0.037359848618507385" arg: "0.1385539323091507" arg: "0.5478288531303406" arg: "0.4302785396575928" arg: "0.31248337030410767" arg: "0.4476741850376129" arg: "0.4958922266960144" arg: "0.3047107756137848" arg: "-0.1087426096200943"
+    arg: "-0.2650264799594879" arg: "0.001212756964378059" arg: "-0.1455630362033844" arg: "-0.06888622045516968" arg: "-0.39034101366996765" arg: "-0.15759187936782837" arg: "-0.0332697331905365" arg: "-0.20074871182441711" arg: "0.04032357037067413" arg: "-0.15028415620326996" arg: "-0.094477079808712" arg: "-0.08076327294111252" arg: "0.2434137463569641" arg: "0.6036377549171448" arg: "0.2083856165409088" arg: "0.421690434217453" arg: "0.31900280714035034" arg: "0.4197741746902466" arg: "0.168703094124794" arg: "-0.04694703593850136" arg: "0.016685811802744865" arg: "-0.24497397243976593" arg: "-0.12784908711910248" arg: "0.04392942041158676" arg: "0.43009427189826965" arg: "0.1300940066576004" arg: "-0.006342500913888216" arg: "0.28684887290000916"
+    arg: "0.2645731270313263" arg: "-0.06917668879032135" arg: "0.04553110525012016" arg: "0.0005655331769958138" arg: "0.02004513144493103" arg: "-0.12128561735153198" arg: "0.23945191502571106" arg: "0.011923068203032017" arg: "0.049406301230192184" arg: "0.10682254284620285" arg: "-0.15404820442199707" arg: "0.04071756824851036" arg: "-0.04888581857085228" arg: "0.03098251111805439" arg: "-0.11950475722551346" arg: "-0.47736215591430664" arg: "-0.5907397270202637" arg: "-0.5694231390953064" arg: "-0.3115452826023102" arg: "-0.3244591951370239" arg: "-0.44562792778015137" arg: "-0.2824826240539551" arg: "-0.608927845954895" arg: "-0.5112527012825012" arg: "-0.5793758630752563" arg: "-0.7528161406517029" arg: "-0.10694423317909241" arg: "0.03750178962945938"
+    arg: "-0.09262563288211823" arg: "0.33829057216644287" arg: "0.19356343150138855" arg: "-0.32529792189598083" arg: "-0.11209913343191147" arg: "-0.17982369661331177" arg: "-0.02031439170241356" arg: "-0.04840415343642235" arg: "0.2679027318954468" arg: "-0.0351799800992012" arg: "0.22578150033950806" arg: "0.024941330775618553" arg: "-0.22684139013290405" arg: "0.0645766407251358" arg: "0.45471301674842834" arg: "0.006177396513521671" arg: "-0.07253464311361313" arg: "-0.031976472586393356" arg: "-0.1588464081287384" arg: "-0.35738927125930786" arg: "-0.2679489850997925" arg: "0.13583332300186157" arg: "0.6051817536354065" arg: "0.7277238368988037" arg: "0.7886832356452942" arg: "0.30304884910583496" arg: "0.23822274804115295" arg: "-0.21040984988212585"
+    arg: "0.13347174227237701" arg: "-0.018098508939146996" arg: "0.3020147383213043" arg: "0.36663228273391724" arg: "0.19701610505580902" arg: "0.32974785566329956" arg: "0.4301891624927521" arg: "0.07097901403903961" arg: "0.3667917549610138" arg: "0.3058207929134369" arg: "0.047205567359924316" arg: "0.19449062645435333" arg: "0.40699872374534607" arg: "0.04719206318259239" arg: "0.13794705271720886" arg: "-0.12094765901565552" arg: "-0.11291661113500595" arg: "0.0622805655002594" arg: "0.202839195728302" arg: "0.3466202914714813" arg: "0.2170470952987671" arg: "0.3939531445503235" arg: "0.5407551527023315" arg: "0.43948447704315186" arg: "0.3990897834300995" arg: "0.0408027209341526" arg: "-0.1760343313217163" arg: "0.014698908664286137"
+    arg: "-0.04886699095368385" arg: "0.021053045988082886" arg: "0.5374348759651184" arg: "0.08056395500898361" arg: "-0.22925740480422974" arg: "-0.3281041383743286" arg: "-0.07999913394451141" arg: "-0.021695485338568687" arg: "0.011688797734677792" arg: "0.03478331118822098" arg: "0.2215108573436737" arg: "0.20613346993923187" arg: "0.06630691885948181" arg: "0.1218089833855629" arg: "0.11808548867702484" arg: "0.45628872513771057" arg: "0.2919962406158447" arg: "0.14256659150123596" arg: "0.175963893532753" arg: "0.04495575651526451" arg: "0.2504587471485138" arg: "0.026869049295783043" arg: "0.47956186532974243" arg: "0.02193787880241871" arg: "-0.4510112404823303" arg: "-0.313761830329895" arg: "-0.1106211319565773" arg: "0.2789842486381531"
+    arg: "0.06086614355444908" arg: "0.258357435464859" arg: "-0.21907173097133636" arg: "0.3981928527355194" arg: "0.4303799867630005" arg: "0.3879024386405945" arg: "0.43367868661880493" arg: "0.4597713351249695" arg: "0.45437106490135193" arg: "-0.02416928857564926" arg: "-0.05651269108057022" arg: "0.28281864523887634" arg: "0.0635884702205658" arg: "0.2554715573787689" arg: "0.242105171084404" arg: "0.4487742483615875" arg: "0.31888487935066223" arg: "-0.004678715486079454" arg: "-0.2514997124671936" arg: "-0.0040764473378658295" arg: "-0.32342639565467834" arg: "-0.05584603548049927" arg: "-0.06270022690296173" arg: "-0.036144010722637177" arg: "-0.15446072816848755" arg: "0.23341621458530426" arg: "0.33822396397590637" arg: "0.05738767236471176"
+    arg: "0.10699111223220825" arg: "-0.03672357276082039" arg: "0.05017268285155296" arg: "0.0031773506198078394" arg: "0.21564021706581116" arg: "-0.016393177211284637" arg: "0.04495749995112419" arg: "0.08746970444917679" arg: "0.2627675533294678" arg: "-0.06982193887233734" arg: "0.16397050023078918" arg: "0.12711408734321594" arg: "0.1515779048204422" arg: "0.5583046674728394" arg: "0.6618807911872864" arg: "0.6948606967926025" arg: "0.7728397846221924" arg: "0.5394765734672546" arg: "0.21606062352657318" arg: "-0.045014407485723495" arg: "-0.00790402665734291" arg: "0.05512310191988945" arg: "-0.10717213153839111" arg: "0.12510299682617188" arg: "-0.2419642060995102" arg: "-0.5152121782302856" arg: "-0.22660385072231293" arg: "-0.2283792793750763"
+    arg: "-0.16621996462345123" arg: "-0.15625415742397308" arg: "0.4551774561405182" arg: "-0.03949269279837608" arg: "-0.012321516871452332" arg: "-0.0024175785947591066" arg: "-0.2590198516845703" arg: "-0.05285951495170593" arg: "-0.2730681002140045" arg: "-0.39435532689094543" arg: "-0.23795633018016815" arg: "-0.1415511518716812" arg: "0.3277718722820282" arg: "0.18961961567401886" arg: "0.4701646864414215" arg: "0.7281239032745361" arg: "0.25955408811569214" arg: "0.18941733241081238" arg: "0.06163005158305168" arg: "0.008561286143958569" arg: "0.03527892380952835" arg: "0.04559394717216492" arg: "0.12878121435642242" arg: "0.453266441822052" arg: "0.2222425490617752" arg: "-0.10644187033176422" arg: "-0.2403181493282318" arg: "-0.3840152323246002"
+    arg: "-0.10421296954154968" arg: "0.0513484813272953" arg: "0.043097011744976044" arg: "-0.22645732760429382" arg: "-0.13321255147457123" arg: "0.16552734375" arg: "0.3591425120830536" arg: "0.27922341227531433" arg: "0.02332492358982563" arg: "-0.1489277184009552" arg: "0.03313438966870308" arg: "0.6452564001083374" arg: "0.7131020426750183" arg: "0.3456207513809204" arg: "0.08521326631307602" arg: "0.3773330748081207" arg: "-0.06791231781244278" arg: "-0.042457882314920425" arg: "-0.2748822271823883" arg: "-0.45328542590141296" arg: "-0.13675789535045624" arg: "-0.35200127959251404" arg: "-0.3977891802787781" arg: "-0.22524717450141907" arg: "-0.5592325329780579" arg: "-0.6521109342575073" arg: "-0.4787251651287079" arg: "0.4251031279563904"
+    arg: "-0.10961339622735977" arg: "0.08067493140697479" arg: "0.19314810633659363" arg: "0.15304067730903625" arg: "0.08039616048336029" arg: "-0.12978368997573853" arg: "0.23539943993091583" arg: "-0.0029274635016918182" arg: "0.19411355257034302" arg: "0.019054999575018883" arg: "0.19173188507556915" arg: "-0.09392274171113968" arg: "0.38477426767349243" arg: "0.04760168865323067" arg: "0.3185006082057953" arg: "0.5135385394096375" arg: "0.10360299050807953" arg: "0.23731642961502075" arg: "0.30989235639572144" arg: "0.22052974998950958" arg: "0.3194608986377716" arg: "0.5341771245002747" arg: "0.518899142742157" arg: "0.23889583349227905" arg: "0.30273208022117615" arg: "-0.019300086423754692" arg: "0.039211057126522064" arg: "-0.10289957374334335"
+  }
+}
+operand {
+  name: "input_to_forget_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 28 }
+  filler {
+    tag: "explicit"
+    arg: "-0.07628004252910614" arg: "-0.10215829312801361" arg: "-0.05716871842741966" arg: "0.10802043229341507" arg: "0.17770273983478546" arg: "0.2961052358150482" arg: "0.20247122645378113" arg: "0.2446555495262146" arg: "0.040852244943380356" arg: "-0.11335061490535736" arg: "0.10778427124023438" arg: "0.02703988179564476" arg: "0.10350820422172546" arg: "0.022522294893860817" arg: "-0.2625943720340729" arg: "-0.4493379592895508" arg: "-0.3724125623703003" arg: "-0.0114969527348876" arg: "0.3713493049144745" arg: "0.15514639019966125" arg: "0.02222340926527977" arg: "-0.1641593724489212" arg: "-0.1006893664598465" arg: "-0.36185842752456665" arg: "-0.29800334572792053" arg: "-0.1352705955505371" arg: "-0.1678757518529892" arg: "-0.04674749821424484"
+    arg: "0.1935243457555771" arg: "-0.3558758497238159" arg: "-0.01225559413433075" arg: "0.17600563168525696" arg: "-0.2398902177810669" arg: "0.021266596391797066" arg: "-0.012361799366772175" arg: "-0.07168065011501312" arg: "-0.046216897666454315" arg: "0.2807506322860718" arg: "0.31839627027511597" arg: "0.22673124074935913" arg: "-0.1492728590965271" arg: "-0.22067101299762726" arg: "-0.13988566398620605" arg: "-0.26826149225234985" arg: "-0.12911173701286316" arg: "-0.25909167528152466" arg: "-0.09522391110658646" arg: "-0.2712244987487793" arg: "-0.0641704648733139" arg: "-0.06269702315330505" arg: "0.05739550665020943" arg: "-0.024462971836328506" arg: "-0.07414258271455765" arg: "-0.1316741555929184" arg: "0.2740451693534851" arg: "0.008587119169533253"
+    arg: "0.09579362720251083" arg: "-0.10534976422786713" arg: "-0.20309729874134064" arg: "0.05385243892669678" arg: "-0.12469331920146942" arg: "0.19276906549930573" arg: "-0.14719951152801514" arg: "0.04517911374568939" arg: "0.17648524045944214" arg: "-0.07139024138450623" arg: "0.3134388327598572" arg: "0.30480578541755676" arg: "-0.20624959468841553" arg: "-0.28569987416267395" arg: "-0.463299423456192" arg: "-0.22555772960186005" arg: "0.08225655555725098" arg: "0.22282175719738007" arg: "0.2178572118282318" arg: "0.1534086912870407" arg: "0.0010919382330030203" arg: "-0.049574099481105804" arg: "-0.14441633224487305" arg: "0.10646001249551773" arg: "-0.25545066595077515" arg: "-0.1575625091791153" arg: "0.022244272753596306" arg: "-0.13968679308891296"
+    arg: "-0.021634351462125778" arg: "0.08071571588516235" arg: "0.1013035699725151" arg: "0.016050167381763458" arg: "0.2625211775302887" arg: "0.4876948893070221" arg: "0.43568745255470276" arg: "0.565092146396637" arg: "0.29359373450279236" arg: "0.21053338050842285" arg: "0.3244520127773285" arg: "0.3750203549861908" arg: "0.21750782430171967" arg: "0.0762089267373085" arg: "-0.08389374613761902" arg: "-0.1069231778383255" arg: "-0.03870454430580139" arg: "-0.05467657372355461" arg: "-0.1801743060350418" arg: "-0.16087990999221802" arg: "-0.142457976937294" arg: "-0.17574858665466309" arg: "-0.052057504653930664" arg: "-0.022876683622598648" arg: "0.11125413328409195" arg: "0.04914820194244385" arg: "-0.46803581714630127" arg: "0.06290580332279205"
+    arg: "-0.07477306574583054" arg: "0.21766617894172668" arg: "-0.22997407615184784" arg: "0.034543294459581375" arg: "-0.013903528451919556" arg: "-0.18765689432621002" arg: "-0.0831596627831459" arg: "-0.24837662279605865" arg: "-0.3817770779132843" arg: "-0.27927181124687195" arg: "-0.1098317876458168" arg: "-0.10399161279201508" arg: "0.07339660823345184" arg: "0.24020932614803314" arg: "0.054254304617643356" arg: "-0.29698851704597473" arg: "-0.8958835601806641" arg: "-0.5978922247886658" arg: "-0.25239694118499756" arg: "0.3084123134613037" arg: "0.7305506467819214" arg: "0.4846576452255249" arg: "0.2881616950035095" arg: "0.41025322675704956" arg: "0.5327757596969604" arg: "0.32260239124298096" arg: "0.31244078278541565" arg: "0.19641001522541046"
+    arg: "0.4518318772315979" arg: "0.0918312519788742" arg: "0.21931017935276031" arg: "-0.2809772789478302" arg: "0.04512198641896248" arg: "-0.12032749503850937" arg: "0.13106974959373474" arg: "0.16737745702266693" arg: "-0.06265480816364288" arg: "-0.12420768290758133" arg: "0.026834065094590187" arg: "0.057427167892456055" arg: "0.13727468252182007" arg: "0.17098888754844666" arg: "0.1360159069299698" arg: "-0.29676762223243713" arg: "0.02863573096692562" arg: "0.11650068312883377" arg: "0.08228091150522232" arg: "0.0004863425565417856" arg: "0.014332028105854988" arg: "0.21943029761314392" arg: "0.2125539928674698" arg: "0.07996927946805954" arg: "0.39755111932754517" arg: "0.24480314552783966" arg: "-0.03607768565416336" arg: "-0.04457511007785797"
+    arg: "-0.1347951889038086" arg: "-0.12047966569662094" arg: "0.10575897246599197" arg: "0.05170439928770065" arg: "-0.1254684031009674" arg: "-0.014801939949393272" arg: "0.21512190997600555" arg: "-0.10265995562076569" arg: "0.27830609679222107" arg: "0.023503802716732025" arg: "-0.028669001534581184" arg: "0.39614439010620117" arg: "0.6066746711730957" arg: "0.2178766429424286" arg: "-0.011598336510360241" arg: "0.3191794753074646" arg: "0.21298977732658386" arg: "-0.10889667272567749" arg: "-0.019525714218616486" arg: "0.030179403722286224" arg: "0.15849147737026215" arg: "0.22630034387111664" arg: "0.009056099690496922" arg: "-0.05425706133246422" arg: "-0.0458354689180851" arg: "-0.21161434054374695" arg: "-0.1947891265153885" arg: "-0.263959139585495"
+    arg: "0.027783045545220375" arg: "-0.04745020717382431" arg: "0.3300989270210266" arg: "0.28470176458358765" arg: "-0.17005865275859833" arg: "-0.07439357787370682" arg: "0.1667899340391159" arg: "0.45858997106552124" arg: "0.37785887718200684" arg: "0.6112445592880249" arg: "0.3187272250652313" arg: "-0.06941720843315125" arg: "-0.36996451020240784" arg: "-0.31387877464294434" arg: "-0.639223575592041" arg: "-0.44944101572036743" arg: "-0.06550875306129456" arg: "0.06676022708415985" arg: "0.007136007770895958" arg: "-0.21558785438537598" arg: "-0.15798640251159668" arg: "0.01950899139046669" arg: "-0.2211693674325943" arg: "0.013605713844299316" arg: "-0.19591258466243744" arg: "-0.38659483194351196" arg: "0.013284237124025822" arg: "-0.22611309587955475"
+    arg: "-0.15597084164619446" arg: "0.17743876576423645" arg: "-0.037310726940631866" arg: "-0.024244168773293495" arg: "0.2377604991197586" arg: "0.44537392258644104" arg: "0.31278473138809204" arg: "0.1272803544998169" arg: "0.013553287833929062" arg: "0.254456490278244" arg: "0.08175382763147354" arg: "-0.20459230244159698" arg: "-0.2673284709453583" arg: "-0.2404957413673401" arg: "-0.14864800870418549" arg: "0.20921550691127777" arg: "0.29682257771492004" arg: "0.21920187771320343" arg: "0.2311340570449829" arg: "0.011728049255907536" arg: "-0.14417873322963715" arg: "-0.18177829682826996" arg: "-0.05804318189620972" arg: "-0.4260285198688507" arg: "-0.23524904251098633" arg: "0.12287767231464386" arg: "0.2705034911632538" arg: "0.07427411526441574"
+    arg: "-0.0016438784077763557" arg: "0.24241822957992554" arg: "0.48539218306541443" arg: "-0.04733000695705414" arg: "0.26057326793670654" arg: "0.1521688848733902" arg: "0.051105279475450516" arg: "0.30896538496017456" arg: "0.2954115569591522" arg: "0.14518126845359802" arg: "-0.07679435610771179" arg: "-0.17924435436725616" arg: "-0.3208692669868469" arg: "-0.7328397631645203" arg: "-0.02372976578772068" arg: "0.1274699866771698" arg: "-0.06885138899087906" arg: "-0.35549283027648926" arg: "-0.4305097162723541" arg: "-0.3341798782348633" arg: "-0.22487765550613403" arg: "-0.0018067393684759736" arg: "0.30461153388023376" arg: "-0.04262997582554817" arg: "0.18400070071220398" arg: "0.16000767052173615" arg: "0.5017845034599304" arg: "-0.26547643542289734"
+    arg: "-0.22279107570648193" arg: "-0.37462612986564636" arg: "-0.11375132948160172" arg: "-0.318785697221756" arg: "-0.04256489500403404" arg: "0.023857025429606438" arg: "0.24660463631153107" arg: "0.2548283040523529" arg: "0.1715206801891327" arg: "-0.19845722615718842" arg: "-0.5138258934020996" arg: "-0.20912210643291473" arg: "-0.13793939352035522" arg: "0.12139105796813965" arg: "-0.08564136922359467" arg: "-0.11496538668870926" arg: "0.06404484808444977" arg: "0.06541986018419266" arg: "-0.05833537131547928" arg: "0.33602604269981384" arg: "0.24190761148929596" arg: "0.23126089572906494" arg: "0.18084779381752014" arg: "0.10192841291427612" arg: "-0.19622290134429932" arg: "0.2051597237586975" arg: "0.37465494871139526" arg: "0.2135562300682068"
+    arg: "0.20038118958473206" arg: "-0.02999887615442276" arg: "0.27408668398857117" arg: "0.0680413618683815" arg: "0.521243155002594" arg: "0.33456870913505554" arg: "0.11001615226268768" arg: "0.4307146668434143" arg: "0.3331093490123749" arg: "0.3356601595878601" arg: "0.05947132036089897" arg: "0.26261594891548157" arg: "0.15773697197437286" arg: "0.19511407613754272" arg: "-0.14718492329120636" arg: "-0.3149983882904053" arg: "-0.3379979133605957" arg: "-0.2451634705066681" arg: "-0.03271918371319771" arg: "0.2850451171398163" arg: "0.041262079030275345" arg: "0.05697742477059364" arg: "0.10827737301588058" arg: "-0.10368494689464569" arg: "-0.6196221113204956" arg: "-0.44554245471954346" arg: "-0.18561461567878723" arg: "0.07922625541687012"
+    arg: "0.020840616896748543" arg: "0.08754434436559677" arg: "-0.07456904649734497" arg: "0.0035276953130960464" arg: "0.27878332138061523" arg: "0.11924267560243607" arg: "-0.023688653483986855" arg: "0.049631692469120026" arg: "-0.18165265023708344" arg: "0.13382099568843842" arg: "-0.14947009086608887" arg: "-0.27999353408813477" arg: "0.11579195410013199" arg: "0.23342294991016388" arg: "0.26458871364593506" arg: "0.16960042715072632" arg: "0.2244962602853775" arg: "0.251582533121109" arg: "0.047410279512405396" arg: "-0.3055225610733032" arg: "-0.0922807902097702" arg: "-0.0008149942150339484" arg: "-0.0030961039010435343" arg: "0.3732677698135376" arg: "0.358204185962677" arg: "0.15451878309249878" arg: "0.28581294417381287" arg: "-0.008904639631509781"
+    arg: "0.01313134003430605" arg: "0.10318631678819656" arg: "0.12054811418056488" arg: "0.2703510820865631" arg: "-0.012742577120661736" arg: "-0.062323760241270065" arg: "0.09344484657049179" arg: "0.022521527484059334" arg: "0.15448175370693207" arg: "0.14388494193553925" arg: "-0.23548845946788788" arg: "-0.10205905884504318" arg: "-0.28026899695396423" arg: "-0.5158746838569641" arg: "-0.2526220381259918" arg: "-0.018526393920183182" arg: "-0.2256275862455368" arg: "-0.1908768117427826" arg: "-0.013978122733533382" arg: "-0.0744546428322792" arg: "-0.14520809054374695" arg: "-0.05685105547308922" arg: "0.02905760332942009" arg: "0.08797142654657364" arg: "-0.17073869705200195" arg: "0.1176731064915657" arg: "-0.07420363277196884" arg: "0.05669660493731499"
+    arg: "-0.2321733981370926" arg: "0.15404652059078217" arg: "-0.2614485025405884" arg: "-0.19935357570648193" arg: "-0.12706783413887024" arg: "0.13061459362506866" arg: "-0.04816088452935219" arg: "-0.06196342036128044" arg: "0.09632396697998047" arg: "0.5340875387191772" arg: "0.12526774406433105" arg: "0.018420275300741196" arg: "-0.012295903638005257" arg: "-0.07571853697299957" arg: "0.003750501200556755" arg: "0.21794241666793823" arg: "0.2857806086540222" arg: "-0.11748607456684113" arg: "0.010374456644058228" arg: "-0.1973239928483963" arg: "0.056574393063783646" arg: "-0.16627934575080872" arg: "-0.08241312205791473" arg: "-0.28860169649124146" arg: "-0.6715773940086365" arg: "-0.4119777977466583" arg: "-0.05268547311425209" arg: "0.09599226713180542"
+    arg: "-0.23489901423454285" arg: "-0.210902139544487" arg: "-0.4165542423725128" arg: "-0.1252453476190567" arg: "0.0586412139236927" arg: "-0.32756853103637695" arg: "-0.03619222715497017" arg: "-0.14618682861328125" arg: "-0.15533234179019928" arg: "-0.015258180908858776" arg: "0.23733921349048615" arg: "0.4892650842666626" arg: "0.9553998708724976" arg: "0.443565309047699" arg: "-0.022181924432516098" arg: "-0.09372548013925552" arg: "-0.04591056704521179" arg: "-0.03852088004350662" arg: "-0.18001845479011536" arg: "0.13030503690242767" arg: "0.33781635761260986" arg: "-0.00877282302826643" arg: "-0.11648697406053543" arg: "0.17854802310466766" arg: "0.33019575476646423" arg: "0.110745869576931" arg: "0.16787387430667877" arg: "-0.003876873990520835"
+    arg: "-0.23130182921886444" arg: "0.15203757584095" arg: "-0.1979227513074875" arg: "-0.27961796522140503" arg: "-0.16501222550868988" arg: "-0.1727743148803711" arg: "-0.08420325070619583" arg: "-0.03661131486296654" arg: "0.022991381585597992" arg: "0.3267931640148163" arg: "0.17916983366012573" arg: "-0.21305087208747864" arg: "-0.4732128381729126" arg: "-0.14580094814300537" arg: "0.21602611243724823" arg: "0.4339281916618347" arg: "0.021353665739297867" arg: "0.1897697150707245" arg: "-0.18888473510742188" arg: "0.10367560386657715" arg: "-0.17847439646720886" arg: "0.10388167202472687" arg: "-0.04788142070174217" arg: "-0.05652203410863876" arg: "-0.23099040985107422" arg: "-0.16391621530056" arg: "0.06451118737459183" arg: "0.13220994174480438"
+    arg: "0.2434273064136505" arg: "-0.11095169931650162" arg: "0.13803741335868835" arg: "0.21899642050266266" arg: "-0.05742649734020233" arg: "0.2324332445859909" arg: "0.3781294524669647" arg: "0.0380890890955925" arg: "0.34526804089546204" arg: "0.31572604179382324" arg: "0.18964345753192902" arg: "0.21782329678535461" arg: "0.23522818088531494" arg: "-0.3473344147205353" arg: "0.1344681978225708" arg: "-0.11885730922222137" arg: "-0.1880505084991455" arg: "-0.1195577010512352" arg: "-0.060020171105861664" arg: "0.289211243391037" arg: "0.012796066701412201" arg: "-0.03657015040516853" arg: "0.09980439394712448" arg: "0.3341149389743805" arg: "0.15382571518421173" arg: "-0.09110640734434128" arg: "-0.2671661674976349" arg: "-0.3361131548881531"
+    arg: "0.04869367554783821" arg: "-0.29407668113708496" arg: "-0.21786509454250336" arg: "-0.021602999418973923" arg: "0.055243782699108124" arg: "0.3378455638885498" arg: "0.22096234560012817" arg: "0.4419727325439453" arg: "0.21845094859600067" arg: "0.3761645555496216" arg: "-0.047762319445610046" arg: "-0.21078196167945862" arg: "-0.32595154643058777" arg: "-0.3116377592086792" arg: "-0.2809823453426361" arg: "-0.36722978949546814" arg: "-0.721940279006958" arg: "-0.4297282099723816" arg: "-0.48180773854255676" arg: "-0.4869074821472168" arg: "-0.5374395847320557" arg: "-0.28183409571647644" arg: "-0.18897175788879395" arg: "-0.2543872594833374" arg: "-0.27631592750549316" arg: "0.06477966159582138" arg: "0.2254650592803955" arg: "0.3261754810810089"
+    arg: "-0.18023546040058136" arg: "-0.3768996596336365" arg: "-0.22269578278064728" arg: "0.018682828173041344" arg: "-0.11290131509304047" arg: "-0.39474353194236755" arg: "-0.03385370224714279" arg: "0.21576857566833496" arg: "0.177229642868042" arg: "0.042106978595256805" arg: "-0.24501168727874756" arg: "0.08456140011548996" arg: "-0.2069004774093628" arg: "0.09091164171695709" arg: "0.10230324417352676" arg: "0.12411151826381683" arg: "-0.04576163366436958" arg: "-0.09247612208127975" arg: "-0.2126251608133316" arg: "0.17474356293678284" arg: "0.024457167834043503" arg: "0.044812608510255814" arg: "-0.24063478410243988" arg: "-0.18941839039325714" arg: "-0.060114409774541855" arg: "0.05715743824839592" arg: "-0.04718327894806862" arg: "-0.1155826672911644"
+  }
+}
+operand {
+  name: "input_to_cell_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 28 }
+  filler {
+    tag: "explicit"
+    arg: "-0.14322419464588165" arg: "-0.03543514385819435" arg: "-0.37075191736221313" arg: "-0.11541029065847397" arg: "-0.16138359904289246" arg: "0.12561601400375366" arg: "-0.02921752631664276" arg: "0.03920969367027283" arg: "-0.06703276187181473" arg: "0.1305120289325714" arg: "0.05381251871585846" arg: "0.004025470931082964" arg: "-0.23657721281051636" arg: "-0.18540850281715393" arg: "0.1416579633951187" arg: "0.5400514602661133" arg: "0.3134595453739166" arg: "-0.08414574712514877" arg: "-0.25919538736343384" arg: "-0.2152969390153885" arg: "-0.13050059974193573" arg: "-0.08802143484354019" arg: "-0.01510115247219801" arg: "0.008360753767192364" arg: "0.2349795401096344" arg: "0.1903218775987625" arg: "-0.06257583945989609" arg: "-0.28226447105407715"
+    arg: "0.10948476195335388" arg: "0.1869562417268753" arg: "-0.022029563784599304" arg: "-0.05470750108361244" arg: "0.26676875352859497" arg: "0.0009490635129623115" arg: "0.04640969634056091" arg: "0.17441663146018982" arg: "-0.15971945226192474" arg: "0.09322939813137054" arg: "0.12405950576066971" arg: "-0.07882469892501831" arg: "-0.03132086992263794" arg: "-0.3313288986682892" arg: "-0.26879292726516724" arg: "-0.22513961791992188" arg: "-0.17825880646705627" arg: "-0.28025585412979126" arg: "-0.026966359466314316" arg: "0.23260623216629028" arg: "-0.20367872714996338" arg: "0.04657059162855148" arg: "-0.1342175006866455" arg: "-0.039766665548086166" arg: "0.055720679461956024" arg: "-0.08040647208690643" arg: "-0.2785663306713104" arg: "-0.20658431947231293"
+    arg: "-0.2845764458179474" arg: "-0.12577861547470093" arg: "0.27772387862205505" arg: "0.31503239274024963" arg: "0.18268036842346191" arg: "0.08590365201234818" arg: "0.2304043024778366" arg: "-0.1624426394701004" arg: "-0.06931187212467194" arg: "0.06844695657491684" arg: "-0.21547579765319824" arg: "-0.25547099113464355" arg: "-0.17882278561592102" arg: "-0.05633649230003357" arg: "0.40184134244918823" arg: "0.22536933422088623" arg: "0.25438448786735535" arg: "0.0452042818069458" arg: "0.18264786899089813" arg: "0.08751898258924484" arg: "0.2497790902853012" arg: "0.10087688267230988" arg: "0.20765537023544312" arg: "0.14631463587284088" arg: "0.21937739849090576" arg: "0.473107248544693" arg: "0.4417702257633209" arg: "0.11503241211175919"
+    arg: "0.0008742575882934034" arg: "0.06207135319709778" arg: "-0.08988912403583527" arg: "0.05438712611794472" arg: "0.5871970653533936" arg: "0.48566651344299316" arg: "0.5400002002716064" arg: "0.6258884072303772" arg: "0.7265797853469849" arg: "0.7820643782615662" arg: "0.7500579953193665" arg: "0.711792528629303" arg: "0.5214825868606567" arg: "0.38864561915397644" arg: "0.14345985651016235" arg: "0.13323532044887543" arg: "-0.05931813269853592" arg: "0.08922331035137177" arg: "0.0952056497335434" arg: "0.06592926383018494" arg: "0.016503572463989258" arg: "0.0451623909175396" arg: "0.02558143064379692" arg: "-0.07171337306499481" arg: "0.016712144017219543" arg: "-0.2027517557144165" arg: "-0.2736990451812744" arg: "-0.47692665457725525"
+    arg: "-0.048994168639183044" arg: "-0.060884907841682434" arg: "0.05837605521082878" arg: "0.13532890379428864" arg: "0.08775680512189865" arg: "0.20183657109737396" arg: "0.20148716866970062" arg: "0.18878254294395447" arg: "0.053453002125024796" arg: "0.24967534840106964" arg: "0.05353411287069321" arg: "0.03676244989037514" arg: "0.02308550290763378" arg: "0.03319866210222244" arg: "0.26316580176353455" arg: "-0.08851990103721619" arg: "-0.02426389791071415" arg: "-0.5592305064201355" arg: "-0.6976458430290222" arg: "-0.30749642848968506" arg: "0.17886534333229065" arg: "-0.08636200428009033" arg: "0.3144587576389313" arg: "0.04458259046077728" arg: "0.03932805731892586" arg: "0.2133030742406845" arg: "0.46437597274780273" arg: "0.3068107068538666"
+    arg: "0.02069045975804329" arg: "0.09891658276319504" arg: "0.14216284453868866" arg: "-0.17771059274673462" arg: "0.017059292644262314" arg: "0.14904333651065826" arg: "-0.012604858726263046" arg: "0.20474286377429962" arg: "0.03844248130917549" arg: "0.03829401358962059" arg: "-0.08334273099899292" arg: "-0.3701476454734802" arg: "-0.4113644063472748" arg: "-0.142000213265419" arg: "0.1457018107175827" arg: "-0.13357846438884735" arg: "0.0584041103720665" arg: "-0.12736332416534424" arg: "-0.10378235578536987" arg: "-0.1306902915239334" arg: "-0.13070425391197205" arg: "-0.07379347831010818" arg: "-0.1626625508069992" arg: "-0.268823504447937" arg: "-0.11865845322608948" arg: "-0.17326758801937103" arg: "-0.538648784160614" arg: "-0.3999563753604889"
+    arg: "-0.0882401168346405" arg: "0.08313216269016266" arg: "0.2604030966758728" arg: "0.12791591882705688" arg: "-0.23485024273395538" arg: "-0.20249411463737488" arg: "-0.07451540231704712" arg: "0.10412992537021637" arg: "-0.0015763905830681324" arg: "0.1795114427804947" arg: "0.18463212251663208" arg: "0.23361526429653168" arg: "0.4148138165473938" arg: "0.4245856702327728" arg: "0.26099058985710144" arg: "-0.21013644337654114" arg: "-0.07617087662220001" arg: "-0.08196636289358139" arg: "-0.1636158674955368" arg: "-0.08406206220388412" arg: "-0.1299818903207779" arg: "-0.05562596768140793" arg: "0.11821522563695908" arg: "0.14262455701828003" arg: "0.1502220183610916" arg: "0.5341688394546509" arg: "0.21171703934669495" arg: "-0.07061432301998138"
+    arg: "0.25366824865341187" arg: "0.24823081493377686" arg: "0.16083313524723053" arg: "0.2527167499065399" arg: "0.23111572861671448" arg: "0.058159034699201584" arg: "-0.09976936876773834" arg: "-0.05263379588723183" arg: "0.03194894641637802" arg: "0.0328059084713459" arg: "-0.02465866319835186" arg: "-0.31380391120910645" arg: "-0.2766170799732208" arg: "-0.3121233880519867" arg: "-0.12022770196199417" arg: "-0.08422422409057617" arg: "0.09094454348087311" arg: "0.03537796065211296" arg: "-0.003948139026761055" arg: "0.08172310143709183" arg: "-0.027632299810647964" arg: "-0.08769393712282181" arg: "0.33617734909057617" arg: "0.3525821268558502" arg: "-0.089286670088768" arg: "-0.11947164684534073" arg: "0.05121488869190216" arg: "0.0871497243642807"
+    arg: "0.13053961098194122" arg: "0.02525678649544716" arg: "0.2623661160469055" arg: "-0.031171713024377823" arg: "0.03309366852045059" arg: "0.26168009638786316" arg: "0.2352420836687088" arg: "0.20169517397880554" arg: "0.04541448503732681" arg: "0.23629099130630493" arg: "0.21913982927799225" arg: "0.10123161971569061" arg: "0.021161029115319252" arg: "-0.0648307353258133" arg: "0.059982750564813614" arg: "0.05455026403069496" arg: "-0.031897980719804764" arg: "0.17262086272239685" arg: "0.018163474276661873" arg: "-0.1253158450126648" arg: "0.09696071594953537" arg: "0.08505688607692719" arg: "-0.07572611421346664" arg: "0.1470259726047516" arg: "0.38780710101127625" arg: "0.24831682443618774" arg: "0.2748945355415344" arg: "-0.05742323026061058"
+    arg: "-0.12352827936410904" arg: "-0.05199163407087326" arg: "0.32428112626075745" arg: "0.05729059875011444" arg: "-0.008717969991266727" arg: "-0.05560842528939247" arg: "0.15452425181865692" arg: "0.19150300323963165" arg: "0.27473723888397217" arg: "0.21984589099884033" arg: "0.17149868607521057" arg: "0.10660523176193237" arg: "-0.14198485016822815" arg: "-0.20040656626224518" arg: "-0.2936631441116333" arg: "-0.2659694254398346" arg: "-0.10012303292751312" arg: "0.18225309252738953" arg: "0.301752507686615" arg: "0.07418902218341827" arg: "0.415781170129776" arg: "0.20212115347385406" arg: "0.40611782670021057" arg: "0.4555768370628357" arg: "0.5562334656715393" arg: "0.28083983063697815" arg: "-0.0601402223110199" arg: "-0.006858934182673693"
+    arg: "-0.1211753711104393" arg: "-0.2732102572917938" arg: "0.05482182651758194" arg: "-0.21921852231025696" arg: "0.05309062823653221" arg: "-0.1784476339817047" arg: "-0.2689800560474396" arg: "-0.3562871515750885" arg: "-0.18823957443237305" arg: "-0.29951784014701843" arg: "-0.13354501128196716" arg: "0.11218584328889847" arg: "0.3258461654186249" arg: "0.22131332755088806" arg: "0.40942832827568054" arg: "0.21726171672344208" arg: "-0.026453329250216484" arg: "0.2588849663734436" arg: "-0.011599023826420307" arg: "0.13138829171657562" arg: "-0.02606634423136711" arg: "-0.022416120395064354" arg: "-0.21375882625579834" arg: "-0.07201182097196579" arg: "-0.07097262144088745" arg: "-0.21650902926921844" arg: "0.10855670273303986" arg: "-0.22552341222763062"
+    arg: "-0.018311869353055954" arg: "-0.01195024698972702" arg: "-0.18364953994750977" arg: "-0.11891163140535355" arg: "-0.08975464850664139" arg: "-0.03872677683830261" arg: "-0.1561228632926941" arg: "0.08454786241054535" arg: "0.07633733749389648" arg: "-0.0810701847076416" arg: "0.07330472767353058" arg: "0.08883491903543472" arg: "0.10723859816789627" arg: "0.13773062825202942" arg: "-0.022353384643793106" arg: "-0.06568673998117447" arg: "-0.10638830810785294" arg: "-0.4936034679412842" arg: "-0.3799048960208893" arg: "-0.34529590606689453" arg: "-0.5483863353729248" arg: "-0.21261106431484222" arg: "-0.525896430015564" arg: "-0.27971628308296204" arg: "-0.45189738273620605" arg: "-0.11105624586343765" arg: "-0.13174773752689362" arg: "-0.2162051945924759"
+    arg: "-0.05546094477176666" arg: "0.07048869132995605" arg: "-0.009296463802456856" arg: "0.17465157806873322" arg: "-0.13712915778160095" arg: "-0.18633928894996643" arg: "-0.09989194571971893" arg: "-0.1971769630908966" arg: "-0.0062749506905674934" arg: "-0.06607092171907425" arg: "-0.051594603806734085" arg: "0.09821145981550217" arg: "0.12090478837490082" arg: "0.2579136788845062" arg: "-0.11981549859046936" arg: "-0.16096201539039612" arg: "-0.20961809158325195" arg: "-0.3041784167289734" arg: "-0.12349500507116318" arg: "0.012003187090158463" arg: "-0.008235737681388855" arg: "0.157791867852211" arg: "0.034064337611198425" arg: "0.37696516513824463" arg: "0.41845211386680603" arg: "0.34168341755867004" arg: "0.1698485016822815" arg: "0.12188931554555893"
+    arg: "0.2781696915626526" arg: "-0.16692659258842468" arg: "0.19220511615276337" arg: "0.2646377682685852" arg: "0.430692583322525" arg: "0.1798837035894394" arg: "0.304645836353302" arg: "0.2644904553890228" arg: "0.4548005759716034" arg: "0.11960816383361816" arg: "0.28225284814834595" arg: "0.2256070226430893" arg: "0.11082617938518524" arg: "0.010454414412379265" arg: "0.029175851494073868" arg: "-0.07078122347593307" arg: "0.1820518672466278" arg: "-0.044396884739398956" arg: "0.19291731715202332" arg: "0.16714687645435333" arg: "-0.07964447140693665" arg: "0.17365328967571259" arg: "0.04092591255903244" arg: "0.11975318193435669" arg: "0.295398473739624" arg: "0.2982410788536072" arg: "-0.02866576984524727" arg: "0.016789700835943222"
+    arg: "-0.012119884602725506" arg: "0.05438081920146942" arg: "-0.22899110615253448" arg: "-0.2591288983821869" arg: "-0.054602570831775665" arg: "-0.14894776046276093" arg: "0.1144491583108902" arg: "-0.12802022695541382" arg: "0.2082153707742691" arg: "0.2738417685031891" arg: "0.2696736752986908" arg: "0.20811103284358978" arg: "-0.23155954480171204" arg: "-0.16073471307754517" arg: "0.19584868848323822" arg: "0.33939072489738464" arg: "0.3128233253955841" arg: "0.235674649477005" arg: "0.2830577492713928" arg: "0.006479979958385229" arg: "0.044619105756282806" arg: "-0.1676308512687683" arg: "-0.2677950859069824" arg: "-0.43867629766464233" arg: "-0.011492089368402958" arg: "0.15210457146167755" arg: "0.07373664528131485" arg: "-0.1156914010643959"
+    arg: "-0.013776483945548534" arg: "-0.018127629533410072" arg: "0.05377393960952759" arg: "0.29386937618255615" arg: "0.2213279902935028" arg: "0.07069018483161926" arg: "0.21774591505527496" arg: "0.2090938538312912" arg: "0.24711604416370392" arg: "0.6695317625999451" arg: "0.46329638361930847" arg: "0.09031569212675095" arg: "0.18645460903644562" arg: "-0.30952781438827515" arg: "0.03084215708076954" arg: "-0.07405883073806763" arg: "0.23570561408996582" arg: "-0.042296942323446274" arg: "0.06679017841815948" arg: "0.04472753405570984" arg: "0.10317760705947876" arg: "0.02574877440929413" arg: "0.16524934768676758" arg: "-0.18901412189006805" arg: "-0.002970139030367136" arg: "0.1698213666677475" arg: "-0.058196987956762314" arg: "-0.16562488675117493"
+    arg: "-0.05622567981481552" arg: "-0.15602625906467438" arg: "0.03411481902003288" arg: "0.05091657117009163" arg: "-0.06706684082746506" arg: "0.2647324204444885" arg: "-0.1097978800535202" arg: "-0.01716734655201435" arg: "0.07656659185886383" arg: "0.08834546059370041" arg: "0.10541308671236038" arg: "0.09015700966119766" arg: "-0.05200522020459175" arg: "-0.5801278948783875" arg: "-0.28525060415267944" arg: "0.23951329290866852" arg: "0.15975099802017212" arg: "0.07012801617383957" arg: "-0.008377078920602798" arg: "-0.05613056570291519" arg: "0.017154719680547714" arg: "-0.11925014108419418" arg: "-0.1528388410806656" arg: "-0.08968795090913773" arg: "0.10360284149646759" arg: "-0.0059541938826441765" arg: "-0.15356747806072235" arg: "-0.06125378981232643"
+    arg: "-0.2849915325641632" arg: "0.04452868923544884" arg: "-0.1001860573887825" arg: "-0.051032423973083496" arg: "-0.3187020719051361" arg: "-0.4221560060977936" arg: "-0.026225173845887184" arg: "-0.17684528231620789" arg: "-0.23219993710517883" arg: "-0.31752654910087585" arg: "-0.16388170421123505" arg: "-0.408907026052475" arg: "-0.22536908090114594" arg: "-0.06049299240112305" arg: "0.16050300002098083" arg: "-0.0070655131712555885" arg: "-0.04127126559615135" arg: "0.025422271341085434" arg: "0.14560039341449738" arg: "0.016476156190037727" arg: "-0.11462834477424622" arg: "0.04149504005908966" arg: "-0.028964219614863396" arg: "-0.17375178635120392" arg: "0.2279641181230545" arg: "0.16986951231956482" arg: "-0.021325843408703804" arg: "0.07627731561660767"
+    arg: "0.26788678765296936" arg: "0.18480995297431946" arg: "0.27279922366142273" arg: "0.09604113548994064" arg: "0.11791739612817764" arg: "0.158638134598732" arg: "-0.06370259821414948" arg: "0.012692139483988285" arg: "-0.12293782830238342" arg: "0.15729208290576935" arg: "-0.03052298165857792" arg: "0.011496515944600105" arg: "-0.31932440400123596" arg: "-0.400392085313797" arg: "-0.3828366696834564" arg: "-0.03249126672744751" arg: "0.19932818412780762" arg: "0.15364520251750946" arg: "0.04440109804272652" arg: "0.08241084218025208" arg: "0.3154240846633911" arg: "0.30371105670928955" arg: "0.2279568612575531" arg: "0.15496046841144562" arg: "0.20785588026046753" arg: "0.061398304998874664" arg: "-0.4503801465034485" arg: "-0.29032525420188904"
+    arg: "-0.14293214678764343" arg: "-0.07592987269163132" arg: "-0.13592901825904846" arg: "0.0590178519487381" arg: "0.05973733589053154" arg: "0.16063377261161804" arg: "0.0970887765288353" arg: "0.005558301229029894" arg: "-0.06746333837509155" arg: "-0.08073955029249191" arg: "-0.11676499992609024" arg: "-0.030398180708289146" arg: "0.020192358642816544" arg: "-0.20045405626296997" arg: "-0.33653098344802856" arg: "-0.009017355740070343" arg: "-0.20934724807739258" arg: "0.1834343820810318" arg: "-0.2903430759906769" arg: "-0.12888988852500916" arg: "-0.39840051531791687" arg: "-0.19070746004581451" arg: "-0.40943092107772827" arg: "-0.2717587947845459" arg: "0.20603778958320618" arg: "-0.3883904814720154" arg: "-0.039434246718883514" arg: "-0.082768514752388"
+  }
+}
+operand {
+  name: "input_to_output_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 28 }
+  filler {
+    tag: "explicit"
+    arg: "-0.13361161947250366" arg: "0.2535432279109955" arg: "-0.12359361350536346" arg: "-0.01544901356101036" arg: "-0.1801277995109558" arg: "-0.17746007442474365" arg: "-0.15477000176906586" arg: "0.05789067968726158" arg: "-0.05010955408215523" arg: "0.16625314950942993" arg: "-0.05529220774769783" arg: "0.13654044270515442" arg: "0.08228389918804169" arg: "-0.2581821084022522" arg: "-0.2971959412097931" arg: "0.019851312041282654" arg: "0.06781011819839478" arg: "0.256449818611145" arg: "0.9392919540405273" arg: "0.6971920132637024" arg: "0.9516055583953857" arg: "0.564976692199707" arg: "0.4223553538322449" arg: "-0.4222773611545563" arg: "-0.08974764496088028" arg: "-0.24621441960334778" arg: "-0.39981308579444885" arg: "-0.22260607779026031"
+    arg: "-0.08982681483030319" arg: "0.031929973512887955" arg: "0.07003000378608704" arg: "0.1441173106431961" arg: "-0.3362192213535309" arg: "0.002978335367515683" arg: "-0.17029152810573578" arg: "0.09627201408147812" arg: "0.08026549220085144" arg: "-0.0912403017282486" arg: "0.2660815119743347" arg: "0.759759247303009" arg: "0.5078319311141968" arg: "0.5035433173179626" arg: "0.3447149693965912" arg: "-0.003492701565846801" arg: "0.31759369373321533" arg: "0.009249270893633366" arg: "0.1448427438735962" arg: "-0.2437012791633606" arg: "-0.23719677329063416" arg: "-0.4108390510082245" arg: "-0.021009216085076332" arg: "-0.06669881194829941" arg: "-0.27399614453315735" arg: "-0.3611904978752136" arg: "-0.10668569058179855" arg: "0.04301835596561432"
+    arg: "0.14136508107185364" arg: "-0.20753467082977295" arg: "-0.12640978395938873" arg: "-0.0813758373260498" arg: "0.061406463384628296" arg: "0.22040338814258575" arg: "-0.10975504666566849" arg: "0.2034149467945099" arg: "0.045333147048950195" arg: "0.2408442199230194" arg: "-0.07402602583169937" arg: "-0.05966627970337868" arg: "0.222041517496109" arg: "-0.0912318304181099" arg: "0.5839067697525024" arg: "0.638180136680603" arg: "1.0717300176620483" arg: "0.8844493627548218" arg: "0.9932028651237488" arg: "0.7096611857414246" arg: "0.6521549224853516" arg: "0.2312447875738144" arg: "0.3880407214164734" arg: "0.48241227865219116" arg: "0.5459808111190796" arg: "0.21763679385185242" arg: "0.10172371566295624" arg: "0.02386627160012722"
+    arg: "0.22363805770874023" arg: "0.055896203964948654" arg: "0.061871618032455444" arg: "-0.17662213742733002" arg: "0.26071617007255554" arg: "0.47994768619537354" arg: "0.4157676696777344" arg: "0.3473230302333832" arg: "0.16535861790180206" arg: "0.3827962279319763" arg: "0.0706290602684021" arg: "0.38254064321517944" arg: "0.28826048970222473" arg: "0.3427451252937317" arg: "0.6095309257507324" arg: "0.3840809166431427" arg: "0.42494848370552063" arg: "0.5521677732467651" arg: "0.36389851570129395" arg: "0.32664236426353455" arg: "0.5916943550109863" arg: "0.2420167326927185" arg: "0.5305866599082947" arg: "0.2550817131996155" arg: "-0.2333785742521286" arg: "-0.472649484872818" arg: "-0.3964008092880249" arg: "-0.1272299885749817"
+    arg: "0.03291049599647522" arg: "0.0799076184630394" arg: "-0.14570799469947815" arg: "0.45667150616645813" arg: "0.2630525827407837" arg: "0.10890132933855057" arg: "-0.054551925510168076" arg: "0.0046729231253266335" arg: "0.21604961156845093" arg: "0.2429116815328598" arg: "-0.04739723354578018" arg: "-0.01825188286602497" arg: "-0.08609837293624878" arg: "0.20959249138832092" arg: "0.06693773716688156" arg: "-0.2576264441013336" arg: "-0.524071216583252" arg: "-0.23866695165634155" arg: "-0.110318124294281" arg: "0.21306151151657104" arg: "0.22546638548374176" arg: "0.04744942858815193" arg: "0.02165571227669716" arg: "-0.06357958167791367" arg: "-0.29460445046424866" arg: "0.09107953310012817" arg: "0.3577098548412323" arg: "0.2221963107585907"
+    arg: "0.01575949229300022" arg: "0.14965087175369263" arg: "0.1605227142572403" arg: "-0.1556941121816635" arg: "-0.16914859414100647" arg: "-0.01264619454741478" arg: "0.08487699925899506" arg: "-0.02691168338060379" arg: "-0.13130810856819153" arg: "0.1136908307671547" arg: "0.20643149316310883" arg: "0.5454611778259277" arg: "0.7623800039291382" arg: "0.7830140590667725" arg: "0.7355128526687622" arg: "0.6468384265899658" arg: "-0.00944082997739315" arg: "0.05574082210659981" arg: "0.02945263683795929" arg: "0.1267295628786087" arg: "0.20847344398498535" arg: "0.0877644270658493" arg: "0.5400432348251343" arg: "0.523216187953949" arg: "0.24031612277030945" arg: "-0.0941137745976448" arg: "-0.226273775100708" arg: "0.17517033219337463"
+    arg: "0.22727088630199432" arg: "-0.04092717543244362" arg: "-0.14521321654319763" arg: "-0.0876765251159668" arg: "-0.07828030735254288" arg: "-0.239556223154068" arg: "-0.28606486320495605" arg: "0.0778198391199112" arg: "-0.07059259712696075" arg: "0.01951042376458645" arg: "0.23620696365833282" arg: "0.1878870725631714" arg: "0.33765503764152527" arg: "0.476667582988739" arg: "-0.06353191286325455" arg: "0.017065072432160378" arg: "0.1915859431028366" arg: "0.5179688334465027" arg: "0.11117789149284363" arg: "0.20390978455543518" arg: "0.20907467603683472" arg: "0.21745766699314117" arg: "-0.33880436420440674" arg: "-0.39202067255973816" arg: "-0.6140245795249939" arg: "-0.015891058370471" arg: "-0.37634962797164917" arg: "-0.14018163084983826"
+    arg: "0.08723758161067963" arg: "-0.18214963376522064" arg: "0.04190217703580856" arg: "-0.05375084653496742" arg: "-0.09821449220180511" arg: "0.17345662415027618" arg: "-0.043591149151325226" arg: "0.2588083744049072" arg: "0.15015676617622375" arg: "0.39709022641181946" arg: "0.5637708902359009" arg: "0.44008538126945496" arg: "0.12622207403182983" arg: "-0.11229363828897476" arg: "-0.3005681335926056" arg: "-0.19021284580230713" arg: "-0.01430613361299038" arg: "0.08379453420639038" arg: "0.3318374454975128" arg: "-0.28072816133499146" arg: "-0.48265987634658813" arg: "-0.024674715474247932" arg: "0.04502909258008003" arg: "-0.17233917117118835" arg: "-0.11240006238222122" arg: "-0.2408943474292755" arg: "-0.3078864514827728" arg: "-0.16831211745738983"
+    arg: "0.12704502046108246" arg: "0.00693404208868742" arg: "0.3220159411430359" arg: "0.5156370997428894" arg: "0.3838233947753906" arg: "0.1953398436307907" arg: "0.17885588109493256" arg: "-0.09935329854488373" arg: "0.22020603716373444" arg: "0.2726762294769287" arg: "0.6566031575202942" arg: "0.4547414183616638" arg: "0.774775505065918" arg: "0.1015879288315773" arg: "0.11886601150035858" arg: "-0.05910573527216911" arg: "0.19138272106647491" arg: "0.33189404010772705" arg: "0.46459195017814636" arg: "0.5477021336555481" arg: "-0.03558523580431938" arg: "-0.26250338554382324" arg: "-0.18095482885837555" arg: "-0.6443997621536255" arg: "-0.5638570785522461" arg: "0.0682566836476326" arg: "0.1611909568309784" arg: "0.05711650475859642"
+    arg: "0.19428735971450806" arg: "0.18257075548171997" arg: "0.19051998853683472" arg: "0.0003557652235031128" arg: "0.23444350063800812" arg: "0.269832044839859" arg: "0.3485797345638275" arg: "0.4130585491657257" arg: "-0.15780729055404663" arg: "-0.33516737818717957" arg: "0.0878380611538887" arg: "0.01651749014854431" arg: "-0.32947224378585815" arg: "-0.09176459908485413" arg: "0.5332760214805603" arg: "0.2153072953224182" arg: "0.17049799859523773" arg: "0.1438971608877182" arg: "0.41453301906585693" arg: "0.03268708288669586" arg: "-0.10276871919631958" arg: "-0.051143575459718704" arg: "0.5329627394676208" arg: "0.626736044883728" arg: "0.6361001133918762" arg: "0.2502281665802002" arg: "0.07913760840892792" arg: "0.007263735868036747"
+    arg: "0.24060982465744019" arg: "-0.05091336369514465" arg: "0.035534802824258804" arg: "-0.07280046492815018" arg: "-0.1301843523979187" arg: "0.11763674765825272" arg: "0.10463829338550568" arg: "0.27148929238319397" arg: "-0.09400584548711777" arg: "-0.1651712954044342" arg: "0.17628471553325653" arg: "-0.08193076401948929" arg: "-0.15348084270954132" arg: "0.36272093653678894" arg: "0.34180593490600586" arg: "0.22390322387218475" arg: "0.824752151966095" arg: "0.778559148311615" arg: "0.8345740437507629" arg: "0.6250406503677368" arg: "0.8019649982452393" arg: "0.4851066470146179" arg: "0.5757401585578918" arg: "0.2889276444911957" arg: "0.2547096610069275" arg: "0.3385365605354309" arg: "0.1420331448316574" arg: "0.36819931864738464"
+    arg: "0.2515231668949127" arg: "0.030855854973196983" arg: "-0.10836786776781082" arg: "0.10643685609102249" arg: "-0.09548310190439224" arg: "0.04187479987740517" arg: "0.31773850321769714" arg: "0.3311438262462616" arg: "0.5351244211196899" arg: "0.1878986656665802" arg: "0.30104926228523254" arg: "0.4463382959365845" arg: "0.281780868768692" arg: "-0.1471078246831894" arg: "0.1783357411623001" arg: "0.07812053710222244" arg: "0.18911871314048767" arg: "0.4206305146217346" arg: "0.6459701657295227" arg: "0.4032178223133087" arg: "0.5664452314376831" arg: "0.33140894770622253" arg: "-0.025404682382941246" arg: "-0.3444240987300873" arg: "-0.2518601715564728" arg: "-0.5755497813224792" arg: "-0.42604678869247437" arg: "-0.09392133355140686"
+    arg: "0.2687152326107025" arg: "0.4339213967323303" arg: "0.2603331208229065" arg: "-0.025584589689970016" arg: "0.02291446179151535" arg: "0.21942569315433502" arg: "0.5105418562889099" arg: "0.24948522448539734" arg: "0.34695175290107727" arg: "0.32354483008384705" arg: "0.038374610245227814" arg: "-0.015621446073055267" arg: "0.41838541626930237" arg: "0.5760942697525024" arg: "0.5837766528129578" arg: "0.6488270163536072" arg: "0.03806944563984871" arg: "-0.30252325534820557" arg: "-0.5109604001045227" arg: "-0.6022301912307739" arg: "-0.4811290502548218" arg: "-0.23938359320163727" arg: "0.15395738184452057" arg: "0.31103089451789856" arg: "0.33056965470314026" arg: "0.3005286157131195" arg: "0.5069742798805237" arg: "-0.21318034827709198"
+    arg: "0.12214536964893341" arg: "-0.15474587678909302" arg: "0.3912317454814911" arg: "0.5372982621192932" arg: "0.4572385549545288" arg: "0.16643570363521576" arg: "0.0814824178814888" arg: "0.3027104139328003" arg: "0.23147137463092804" arg: "0.3813971281051636" arg: "0.15540477633476257" arg: "0.08324414491653442" arg: "0.519047200679779" arg: "0.37160855531692505" arg: "0.6911864280700684" arg: "0.21933679282665253" arg: "-0.009083807468414307" arg: "0.4009028375148773" arg: "0.5206535458564758" arg: "0.2980058789253235" arg: "0.4483773112297058" arg: "0.5295672416687012" arg: "0.6978735327720642" arg: "0.33932334184646606" arg: "0.3536893129348755" arg: "0.4484431743621826" arg: "-0.09894105792045593" arg: "-0.017690571025013924"
+    arg: "-0.1264471709728241" arg: "0.004865952301770449" arg: "-0.020671315491199493" arg: "-0.3312399685382843" arg: "-0.1591784507036209" arg: "0.22974173724651337" arg: "0.11263400316238403" arg: "-0.058258578181266785" arg: "-0.17727409303188324" arg: "0.22839052975177765" arg: "0.14666402339935303" arg: "-0.11730131506919861" arg: "0.05218665674328804" arg: "0.1637987196445465" arg: "0.13374973833560944" arg: "0.35711121559143066" arg: "0.5225153565406799" arg: "0.29755473136901855" arg: "0.2492614984512329" arg: "-0.18074239790439606" arg: "-0.14335356652736664" arg: "0.0032210154458880424" arg: "0.1897306889295578" arg: "0.22594426572322845" arg: "-0.5188416838645935" arg: "-0.35428524017333984" arg: "-0.030699916183948517" arg: "-0.08329521119594574"
+    arg: "0.0687800794839859" arg: "0.14417889714241028" arg: "0.08634546399116516" arg: "0.4291350245475769" arg: "0.2597505748271942" arg: "0.17120565474033356" arg: "0.006312726065516472" arg: "-0.08433850854635239" arg: "0.024435490369796753" arg: "0.01323175523430109" arg: "0.442842036485672" arg: "0.3250333070755005" arg: "0.36304062604904175" arg: "0.27663564682006836" arg: "0.554750382900238" arg: "0.40236881375312805" arg: "0.19184589385986328" arg: "0.4051419496536255" arg: "0.3190324902534485" arg: "0.202935591340065" arg: "0.23588521778583527" arg: "-0.060444705188274384" arg: "-0.1226918026804924" arg: "0.1633310168981552" arg: "0.13458260893821716" arg: "0.11727706342935562" arg: "0.3460041284561157" arg: "0.06878886371850967"
+    arg: "0.04063422232866287" arg: "-0.006821052171289921" arg: "0.3323805630207062" arg: "0.26635101437568665" arg: "0.3779240548610687" arg: "0.11733505874872208" arg: "-0.10527531802654266" arg: "-0.07571443170309067" arg: "-0.08959870040416718" arg: "0.34649038314819336" arg: "0.11251195520162582" arg: "-0.004889118485152721" arg: "-0.12246599048376083" arg: "-0.007686110679060221" arg: "-0.15067224204540253" arg: "-0.1337168663740158" arg: "-0.13265375792980194" arg: "-0.26213783025741577" arg: "0.011739661917090416" arg: "-0.2025691419839859" arg: "-0.09236078709363937" arg: "0.1839291900396347" arg: "0.039318203926086426" arg: "0.15644147992134094" arg: "0.08048530668020248" arg: "0.027235517278313637" arg: "-0.07634953409433365" arg: "-0.010376683436334133"
+    arg: "0.08199792355298996" arg: "-0.1779499351978302" arg: "0.07380770146846771" arg: "-0.0894157737493515" arg: "0.05990520119667053" arg: "0.2665881812572479" arg: "0.03642373904585838" arg: "-0.03619125485420227" arg: "0.011516132391989231" arg: "-0.213419109582901" arg: "0.01569538190960884" arg: "-0.1833057552576065" arg: "0.2472868114709854" arg: "0.8453921675682068" arg: "0.9291183948516846" arg: "0.9661735892295837" arg: "0.6590875387191772" arg: "0.6245248913764954" arg: "0.2572375535964966" arg: "-0.24237875640392303" arg: "0.010541471652686596" arg: "-0.22616639733314514" arg: "-0.027475513517856598" arg: "0.17520084977149963" arg: "-0.2040407955646515" arg: "0.046643588691949844" arg: "-0.263759046792984" arg: "-0.187980055809021"
+    arg: "-0.1663954257965088" arg: "0.13280753791332245" arg: "0.24432727694511414" arg: "-0.1797582507133484" arg: "-0.1539366990327835" arg: "0.4611888825893402" arg: "0.20097453892230988" arg: "0.32330435514450073" arg: "0.2807258665561676" arg: "0.31781595945358276" arg: "0.5915217995643616" arg: "0.3510398864746094" arg: "0.32014137506484985" arg: "0.355925977230072" arg: "0.20221084356307983" arg: "-0.06906703114509583" arg: "-0.1754205822944641" arg: "-0.04989638179540634" arg: "-0.363330215215683" arg: "0.0022096107713878155" arg: "-0.35765916109085083" arg: "0.12441515922546387" arg: "0.2386118769645691" arg: "0.14590708911418915" arg: "-0.20669437944889069" arg: "-0.13032864034175873" arg: "0.15810780227184296" arg: "0.10193713754415512"
+    arg: "0.3005771040916443" arg: "0.3376172184944153" arg: "0.15595310926437378" arg: "0.03881113976240158" arg: "0.049130022525787354" arg: "0.0412493497133255" arg: "0.5658847093582153" arg: "0.7288451194763184" arg: "0.08432513475418091" arg: "0.20872049033641815" arg: "0.2760712504386902" arg: "0.3288831114768982" arg: "0.29803207516670227" arg: "-0.14779740571975708" arg: "-0.04237861558794975" arg: "0.2661236822605133" arg: "0.5379334688186646" arg: "0.5947390198707581" arg: "-0.0807252898812294" arg: "-0.28580325841903687" arg: "-0.5205297470092773" arg: "-0.4381696879863739" arg: "-0.15092800557613373" arg: "0.048043206334114075" arg: "0.09882169216871262" arg: "0.17096076905727386" arg: "-0.2508130967617035" arg: "0.05531834065914154"
+  }
+}
+operand {
+  name: "recurrent_to_input_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "-0.4770118296146393" arg: "0.21366995573043823" arg: "0.46016961336135864" arg: "0.3251325190067291" arg: "0.13871631026268005" arg: "0.21228066086769104" arg: "-0.23697999119758606" arg: "-0.1725820153951645" arg: "0.16972437500953674" arg: "0.011942930519580841" arg: "0.2916385233402252" arg: "0.2334175705909729" arg: "0.8671041131019592" arg: "0.2224881947040558" arg: "-0.15734145045280457" arg: "-0.06410238891839981" arg: "-0.2018616646528244" arg: "0.2584409713745117" arg: "-0.3724243938922882" arg: "-0.3660326302051544"
+    arg: "-0.3494759500026703" arg: "0.4290483593940735" arg: "-0.11313329637050629" arg: "-0.059981122612953186" arg: "-0.14534664154052734" arg: "0.13317358493804932" arg: "-0.31460973620414734" arg: "0.4654754102230072" arg: "0.5217755436897278" arg: "0.36345264315605164" arg: "-0.06475342065095901" arg: "-0.611393392086029" arg: "-0.35462483763694763" arg: "0.21069355309009552" arg: "0.05305428430438042" arg: "-0.08702543377876282" arg: "-0.0562891848385334" arg: "-0.13899517059326172" arg: "0.3408608138561249" arg: "0.09720013290643692"
+    arg: "-0.4325777590274811" arg: "0.09612088650465012" arg: "0.11070075631141663" arg: "0.14977702498435974" arg: "-0.282085120677948" arg: "0.5554621815681458" arg: "-0.09023251384496689" arg: "0.12265370041131973" arg: "-0.04457980766892433" arg: "0.2098589986562729" arg: "0.21766719222068787" arg: "-0.19030693173408508" arg: "0.1261812448501587" arg: "-0.04023653268814087" arg: "0.09346041828393936" arg: "-0.24297039210796356" arg: "0.2858717441558838" arg: "-0.07445301115512848" arg: "-0.26624348759651184" arg: "-0.013775470666587353"
+    arg: "-0.06638309359550476" arg: "0.037332624197006226" arg: "-0.5706251263618469" arg: "0.6612618565559387" arg: "-0.21594902873039246" arg: "0.09637858718633652" arg: "0.4018155038356781" arg: "0.1754107028245926" arg: "0.18305723369121552" arg: "0.409424751996994" arg: "0.13018175959587097" arg: "0.45919686555862427" arg: "-0.3296961486339569" arg: "-0.27102723717689514" arg: "0.15050001442432404" arg: "-0.06254072487354279" arg: "-0.36290204524993896" arg: "-0.3503246605396271" arg: "0.40385496616363525" arg: "-0.32791852951049805"
+    arg: "0.009967965073883533" arg: "0.11481080204248428" arg: "0.25612783432006836" arg: "0.12932859361171722" arg: "0.2397402822971344" arg: "0.10742141306400299" arg: "0.505053699016571" arg: "-0.2892862856388092" arg: "0.4383614659309387" arg: "0.025106344372034073" arg: "0.43115267157554626" arg: "0.3953210115432739" arg: "0.13628928363323212" arg: "-0.1098363846540451" arg: "-0.11229805648326874" arg: "-0.09960231184959412" arg: "-0.0775841549038887" arg: "0.08288741856813431" arg: "0.21580594778060913" arg: "-0.07877210527658463"
+    arg: "-0.2979702353477478" arg: "-0.051671307533979416" arg: "0.05603557080030441" arg: "-0.07683657109737396" arg: "0.05431858450174332" arg: "0.5497500896453857" arg: "-0.3987792432308197" arg: "-0.18461892008781433" arg: "-0.021399449557065964" arg: "-0.08834270387887955" arg: "-0.38572776317596436" arg: "0.2081238031387329" arg: "0.11147842556238174" arg: "-0.18445543944835663" arg: "-0.14360877871513367" arg: "-0.02369718812406063" arg: "0.13527068495750427" arg: "0.15338219702243805" arg: "-0.33769914507865906" arg: "0.12053044140338898"
+    arg: "-0.31656137108802795" arg: "0.022205566987395287" arg: "-1.0957515239715576" arg: "-0.07192184031009674" arg: "-0.3568742573261261" arg: "0.08484519273042679" arg: "0.3926958739757538" arg: "0.15565809607505798" arg: "-0.08836834877729416" arg: "0.27079248428344727" arg: "0.038257431238889694" arg: "-0.08028512448072433" arg: "0.29435107111930847" arg: "-0.07749350368976593" arg: "-0.34698745608329773" arg: "-0.2787376940250397" arg: "0.1498851329088211" arg: "-0.20681093633174896" arg: "0.20965063571929932" arg: "0.14793993532657623"
+    arg: "-0.45168495178222656" arg: "-0.19468553364276886" arg: "0.004608047194778919" arg: "0.3809444308280945" arg: "-0.20797111093997955" arg: "-0.120585598051548" arg: "0.2809143364429474" arg: "0.06513983756303787" arg: "0.18850207328796387" arg: "0.2409287691116333" arg: "0.2608538866043091" arg: "-0.37541523575782776" arg: "-0.023761505261063576" arg: "0.1088205948472023" arg: "0.24460943043231964" arg: "0.3784101605415344" arg: "-0.1123291477560997" arg: "-0.1368710845708847" arg: "-0.5494782328605652" arg: "-0.18547306954860687"
+    arg: "0.3815309405326843" arg: "0.1690528243780136" arg: "-0.35149693489074707" arg: "-0.02372279018163681" arg: "0.10354622453451157" arg: "0.549765408039093" arg: "0.019203156232833862" arg: "-0.3717760443687439" arg: "0.0764758288860321" arg: "-0.2072433978319168" arg: "0.1771903783082962" arg: "0.44540902972221375" arg: "-0.32312753796577454" arg: "-0.2570071518421173" arg: "0.3598842918872833" arg: "0.01568111963570118" arg: "-0.10515885800123215" arg: "-0.0006535121938213706" arg: "-0.4027051329612732" arg: "-0.08736834675073624"
+    arg: "0.4984769821166992" arg: "-0.0011503007262945175" arg: "-0.535097062587738" arg: "0.23247945308685303" arg: "0.10292237997055054" arg: "-0.2671816647052765" arg: "0.46480339765548706" arg: "-0.3894844353199005" arg: "0.3963298201560974" arg: "0.14017800986766815" arg: "0.3199640214443207" arg: "0.4258514642715454" arg: "-0.16700509190559387" arg: "0.09393472969532013" arg: "0.010149846784770489" arg: "0.2868942320346832" arg: "-0.3524361252784729" arg: "-0.6936826705932617" arg: "0.003629873273894191" arg: "-0.09144237637519836"
+    arg: "-0.34517648816108704" arg: "0.1484774649143219" arg: "-0.24635784327983856" arg: "0.01039072871208191" arg: "0.38751891255378723" arg: "0.2944512963294983" arg: "-0.2962084412574768" arg: "0.08631572872400284" arg: "0.03221822530031204" arg: "-0.08134875446557999" arg: "-0.5519762635231018" arg: "0.06120099127292633" arg: "-0.049402546137571335" arg: "-0.4067457318305969" arg: "0.4055297076702118" arg: "0.06430382281541824" arg: "0.11064586043357849" arg: "0.06960950791835785" arg: "-0.31485283374786377" arg: "0.14682182669639587"
+    arg: "-0.19570083916187286" arg: "0.10931383073329926" arg: "-0.007622078992426395" arg: "-0.05675305798649788" arg: "-0.8250163793563843" arg: "-0.11235163360834122" arg: "-0.029461843892931938" arg: "0.7492899298667908" arg: "-0.27355697751045227" arg: "0.3595489263534546" arg: "0.23662255704402924" arg: "-0.3644302189350128" arg: "0.6238518357276917" arg: "-0.3704565465450287" arg: "0.19363875687122345" arg: "0.3572763204574585" arg: "-0.23370115458965302" arg: "0.42023247480392456" arg: "0.15355002880096436" arg: "-0.39801692962646484"
+    arg: "0.2795780897140503" arg: "0.019917918369174004" arg: "-0.14301127195358276" arg: "0.4369097948074341" arg: "-0.33128276467323303" arg: "-0.12198600172996521" arg: "0.6699290871620178" arg: "0.27996954321861267" arg: "-0.04728970676660538" arg: "-0.0063692545518279076" arg: "0.33021485805511475" arg: "0.048498980700969696" arg: "0.10616268217563629" arg: "0.21613231301307678" arg: "0.5218581557273865" arg: "-0.4211953282356262" arg: "0.10934742540121078" arg: "-0.3572162687778473" arg: "-0.04984986037015915" arg: "-0.3323499262332916"
+    arg: "0.05725480243563652" arg: "-0.6297563314437866" arg: "0.24617090821266174" arg: "0.016658928245306015" arg: "-0.3822592496871948" arg: "0.16846376657485962" arg: "-0.02593623474240303" arg: "0.5345171689987183" arg: "0.00866254698485136" arg: "-0.4684853255748749" arg: "0.5099982619285583" arg: "-0.1846589744091034" arg: "-0.31118252873420715" arg: "0.30074822902679443" arg: "0.4734266400337219" arg: "0.446226567029953" arg: "-0.2197871059179306" arg: "0.13501974940299988" arg: "-0.3300747275352478" arg: "-0.35672369599342346"
+    arg: "0.09107185155153275" arg: "0.15899214148521423" arg: "0.3112131953239441" arg: "-0.016727039590477943" arg: "-0.051409196108579636" arg: "0.1564004272222519" arg: "-0.4456101953983307" arg: "0.27580249309539795" arg: "0.0816519483923912" arg: "0.18884021043777466" arg: "-0.35784390568733215" arg: "0.15665903687477112" arg: "0.5751363635063171" arg: "-0.08250349014997482" arg: "-0.052263204008340836" arg: "-0.40821653604507446" arg: "0.14680282771587372" arg: "-0.1555611938238144" arg: "-0.02860925905406475" arg: "-0.03125927224755287"
+    arg: "0.13248908519744873" arg: "-0.8433090448379517" arg: "0.6264944076538086" arg: "-0.013466065749526024" arg: "-0.480976939201355" arg: "0.4193423092365265" arg: "-0.744616687297821" arg: "0.8992355465888977" arg: "-0.3339115083217621" arg: "-0.22039049863815308" arg: "0.18902333080768585" arg: "-0.39615511894226074" arg: "-0.010607750155031681" arg: "0.20654703676700592" arg: "0.25115591287612915" arg: "-0.2122495174407959" arg: "0.025297891348600388" arg: "0.35302531719207764" arg: "-0.0321347676217556" arg: "-0.3839147686958313"
+    arg: "-0.10077176988124847" arg: "-0.45120227336883545" arg: "0.3416426479816437" arg: "0.07022065669298172" arg: "-0.6492688059806824" arg: "-0.010763137601315975" arg: "-0.041585713624954224" arg: "0.45695760846138" arg: "0.4389442801475525" arg: "0.07174579054117203" arg: "0.19659492373466492" arg: "-0.2505846619606018" arg: "-0.5589239001274109" arg: "0.4465855360031128" arg: "0.8945375680923462" arg: "0.47595348954200745" arg: "0.01687660627067089" arg: "0.05361022800207138" arg: "-0.4034039378166199" arg: "-0.15716853737831116"
+    arg: "0.37725311517715454" arg: "-0.21682827174663544" arg: "-0.3331523537635803" arg: "0.4478318691253662" arg: "0.04937843605875969" arg: "0.11571618914604187" arg: "-0.31114915013313293" arg: "0.5024285316467285" arg: "-0.045635756105184555" arg: "0.0683443546295166" arg: "0.3868362605571747" arg: "0.020012596622109413" arg: "-0.17692965269088745" arg: "-0.09838074445724487" arg: "0.4211101830005646" arg: "0.04282836988568306" arg: "-0.002688082167878747" arg: "-0.04299991950392723" arg: "-0.4223831295967102" arg: "-0.4769083559513092"
+    arg: "-0.06549199670553207" arg: "0.3752592206001282" arg: "0.03317650780081749" arg: "0.5872426629066467" arg: "-0.1182107925415039" arg: "-0.10290710628032684" arg: "0.10983741283416748" arg: "0.1915282905101776" arg: "0.043863292783498764" arg: "0.2396492213010788" arg: "0.02648579329252243" arg: "-0.5836915969848633" arg: "0.04271770641207695" arg: "-0.07343849539756775" arg: "0.1856769174337387" arg: "-0.1381441354751587" arg: "0.006854575593024492" arg: "-0.12527717649936676" arg: "0.5925910472869873" arg: "-0.1231672465801239"
+    arg: "-0.06842511147260666" arg: "0.35675689578056335" arg: "-0.07970980554819107" arg: "0.09421294182538986" arg: "0.0697932317852974" arg: "0.23825913667678833" arg: "0.2665153443813324" arg: "-1.03485107421875" arg: "0.06772775202989578" arg: "0.06777352839708328" arg: "0.046700987964868546" arg: "0.2833155393600464" arg: "0.41568082571029663" arg: "-0.3085348904132843" arg: "-0.10529476404190063" arg: "-0.15378959476947784" arg: "0.1103232204914093" arg: "-0.10541176050901413" arg: "0.41803064942359924" arg: "0.3228841722011566"
+  }
+}
+operand {
+  name: "recurrent_to_forget_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "0.06594990938901901" arg: "-0.06807135045528412" arg: "-0.4602802097797394" arg: "0.35252904891967773" arg: "-0.008028666488826275" arg: "-0.03798177093267441" arg: "0.1020055040717125" arg: "-0.3012649416923523" arg: "-0.4212363660335541" arg: "-0.03982044756412506" arg: "-0.050789862871170044" arg: "0.27053192257881165" arg: "-0.013990324921905994" arg: "-0.0896860808134079" arg: "0.1068492904305458" arg: "0.015795979648828506" arg: "-0.1659027636051178" arg: "-0.15145719051361084" arg: "-0.2084323912858963" arg: "0.057955797761678696"
+    arg: "-0.23830100893974304" arg: "-0.003962437156587839" arg: "-0.45242246985435486" arg: "0.09086526930332184" arg: "0.19679458439350128" arg: "-0.1694013625383377" arg: "0.009338092990219593" arg: "0.15751825273036957" arg: "-0.015515184961259365" arg: "0.20204924046993256" arg: "-0.09540849179029465" arg: "-0.04590551182627678" arg: "-0.24671320617198944" arg: "-0.27835050225257874" arg: "0.13569574058055878" arg: "0.40812498331069946" arg: "-0.1699746996164322" arg: "-0.1630825698375702" arg: "-0.07755500078201294" arg: "0.06263996660709381"
+    arg: "0.23931466042995453" arg: "-0.029721643775701523" arg: "-0.2644506096839905" arg: "-0.33931082487106323" arg: "0.19949766993522644" arg: "0.22016771137714386" arg: "0.2121492326259613" arg: "-0.15880468487739563" arg: "0.24859464168548584" arg: "-0.2104686051607132" arg: "-0.23938705027103424" arg: "0.0718555748462677" arg: "-0.349223256111145" arg: "-0.09349290281534195" arg: "-0.0252213254570961" arg: "0.06639551371335983" arg: "-0.046982403844594955" arg: "0.118290975689888" arg: "-0.054646264761686325" arg: "0.32363349199295044"
+    arg: "0.034989047795534134" arg: "-0.08760252594947815" arg: "0.21847350895404816" arg: "-0.4630540907382965" arg: "0.39738836884498596" arg: "-0.040741026401519775" arg: "-0.43425223231315613" arg: "-0.19687709212303162" arg: "-0.14320023357868195" arg: "-0.2363135814666748" arg: "-0.1969219595193863" arg: "-0.20639191567897797" arg: "0.12733085453510284" arg: "-0.3910582959651947" arg: "-0.44535571336746216" arg: "-0.1720532774925232" arg: "0.013997661881148815" arg: "0.3502558469772339" arg: "0.2705589830875397" arg: "0.23238658905029297"
+    arg: "-0.03908773139119148" arg: "0.011227560229599476" arg: "0.4438604414463043" arg: "0.044431619346141815" arg: "-0.14284561574459076" arg: "0.17171142995357513" arg: "0.39687684178352356" arg: "-0.04913221672177315" arg: "0.1656711995601654" arg: "0.06323841214179993" arg: "0.07228634506464005" arg: "-0.045438170433044434" arg: "0.03332178294658661" arg: "0.020702671259641647" arg: "-0.02522851713001728" arg: "-0.014919421635568142" arg: "-0.17055924236774445" arg: "0.027965081855654716" arg: "-0.2815409302711487" arg: "-0.24543267488479614"
+    arg: "-0.09287944436073303" arg: "0.019474849104881287" arg: "0.09233348816633224" arg: "-0.055800918489694595" arg: "0.2498578131198883" arg: "0.042439963668584824" arg: "-0.13415414094924927" arg: "0.5308498740196228" arg: "0.3631361126899719" arg: "-0.19753535091876984" arg: "-0.32000917196273804" arg: "0.01533063966780901" arg: "-0.46180611848831177" arg: "0.042543888092041016" arg: "-0.14118818938732147" arg: "0.03781934827566147" arg: "0.26562583446502686" arg: "0.22630012035369873" arg: "-0.2770325839519501" arg: "-0.16677603125572205"
+    arg: "-0.2678513824939728" arg: "-0.10348694026470184" arg: "-0.0867864191532135" arg: "0.06361433863639832" arg: "0.014271223917603493" arg: "-0.12187133729457855" arg: "0.03993409126996994" arg: "0.028004109859466553" arg: "-0.2755664885044098" arg: "-0.04777361452579498" arg: "0.04847913607954979" arg: "-0.1460455060005188" arg: "0.23377567529678345" arg: "-0.22863848507404327" arg: "-0.14881330728530884" arg: "-0.132281094789505" arg: "0.13625505566596985" arg: "0.18803201615810394" arg: "-0.06886259466409683" arg: "0.006628264673054218"
+    arg: "-0.18729551136493683" arg: "-0.45488521456718445" arg: "0.24042674899101257" arg: "-0.26106804609298706" arg: "0.08683270215988159" arg: "-0.10342814028263092" arg: "-0.07613679021596909" arg: "0.21038036048412323" arg: "-0.2822016179561615" arg: "-0.008857419714331627" arg: "-0.22856365144252777" arg: "0.25842568278312683" arg: "0.03938071057200432" arg: "0.27398109436035156" arg: "0.00563707435503602" arg: "0.04277018457651138" arg: "0.008633948862552643" arg: "0.18542668223381042" arg: "-0.0034568854607641697" arg: "-0.05330372974276543"
+    arg: "0.234075129032135" arg: "-0.0890774056315422" arg: "0.5264164805412292" arg: "-0.27344802021980286" arg: "-0.27058038115501404" arg: "0.01804373227059841" arg: "-0.30603522062301636" arg: "0.24496032297611237" arg: "0.013210487551987171" arg: "-0.07897075265645981" arg: "0.022406281903386116" arg: "0.0693880245089531" arg: "0.015409570187330246" arg: "-0.0077253966592252254" arg: "-0.16814486682415009" arg: "0.13719123601913452" arg: "0.1842775046825409" arg: "0.23263384401798248" arg: "-0.2894793450832367" arg: "-0.0586865171790123"
+    arg: "0.16423609852790833" arg: "0.21610338985919952" arg: "-0.004741444252431393" arg: "-0.06914521008729935" arg: "0.01680983044207096" arg: "-0.15836265683174133" arg: "0.22845181822776794" arg: "0.05326155573129654" arg: "-0.1423141062259674" arg: "0.3005701005458832" arg: "-0.4672607183456421" arg: "-0.023611126467585564" arg: "0.0609925203025341" arg: "0.10335300117731094" arg: "-0.06812556833028793" arg: "-0.07960234582424164" arg: "-0.09673355519771576" arg: "-0.002028367016464472" arg: "0.2790486216545105" arg: "0.16641387343406677"
+    arg: "0.17312130331993103" arg: "-0.12452541291713715" arg: "0.32220134139060974" arg: "-0.22103938460350037" arg: "0.030806513503193855" arg: "-0.1734783947467804" arg: "-0.15974026918411255" arg: "-0.14125876128673553" arg: "0.2410593330860138" arg: "-0.5250580310821533" arg: "-0.04526519402861595" arg: "0.32462355494499207" arg: "0.06616294384002686" arg: "-0.11993109434843063" arg: "0.048295579850673676" arg: "0.042772307991981506" arg: "-0.03536328673362732" arg: "0.3457142114639282" arg: "0.12001463770866394" arg: "0.22245542705059052"
+    arg: "0.02782665565609932" arg: "0.07261228561401367" arg: "-0.33622199296951294" arg: "0.23261497914791107" arg: "0.04636847600340843" arg: "-0.027293216437101364" arg: "0.17709527909755707" arg: "0.018871014937758446" arg: "-0.4241866171360016" arg: "-0.07660052180290222" arg: "-0.3715123236179352" arg: "0.21518565714359283" arg: "0.3018551170825958" arg: "-0.2709880769252777" arg: "-0.1473710685968399" arg: "-0.2565970718860626" arg: "-0.2993161678314209" arg: "0.1733904629945755" arg: "-0.2439367175102234" arg: "0.26016315817832947"
+    arg: "0.21543648838996887" arg: "0.02984066680073738" arg: "0.2857840359210968" arg: "-0.09354538470506668" arg: "0.3686164617538452" arg: "-0.17137302458286285" arg: "-0.13334709405899048" arg: "0.15443699061870575" arg: "-0.3447284698486328" arg: "-0.0766822099685669" arg: "-0.18963581323623657" arg: "-0.07595658302307129" arg: "0.04707604646682739" arg: "-0.23405563831329346" arg: "0.05423225834965706" arg: "-0.23418886959552765" arg: "-0.03189626708626747" arg: "0.2605202794075012" arg: "0.05496497079730034" arg: "0.173336461186409"
+    arg: "0.27640455961227417" arg: "-0.24286918342113495" arg: "-0.24134227633476257" arg: "-0.15636584162712097" arg: "0.2677306830883026" arg: "-0.2062496393918991" arg: "0.32234105467796326" arg: "-0.24469925463199615" arg: "-0.3751060664653778" arg: "-0.23786574602127075" arg: "0.03635139390826225" arg: "0.12451396137475967" arg: "0.26129764318466187" arg: "-0.12637533247470856" arg: "-0.0780411958694458" arg: "0.06617061048746109" arg: "-0.25668978691101074" arg: "0.23007889091968536" arg: "-0.08478987962007523" arg: "0.220413938164711"
+    arg: "-0.10401985049247742" arg: "0.0647420585155487" arg: "0.09111618995666504" arg: "-0.04593143239617348" arg: "0.08350320905447006" arg: "0.023905832320451736" arg: "0.16202807426452637" arg: "0.25432881712913513" arg: "0.17261511087417603" arg: "-0.011524937115609646" arg: "0.07423079758882523" arg: "0.033635564148426056" arg: "0.014234645292162895" arg: "-0.2424505203962326" arg: "0.14718832075595856" arg: "0.14837898313999176" arg: "0.04802917316555977" arg: "0.059234943240880966" arg: "0.2068481594324112" arg: "-0.09739648550748825"
+    arg: "-0.026647251099348068" arg: "-0.32062458992004395" arg: "0.16718854010105133" arg: "-0.32557788491249084" arg: "0.3088855445384979" arg: "-0.1289512664079666" arg: "-0.042579133063554764" arg: "-0.27093860507011414" arg: "-0.2899383306503296" arg: "-0.3538142442703247" arg: "0.4299084544181824" arg: "0.0619647242128849" arg: "0.5301066637039185" arg: "0.08283061534166336" arg: "-0.043685220181941986" arg: "-0.2241324931383133" arg: "-0.3621082305908203" arg: "0.014637312851846218" arg: "-0.6699166893959045" arg: "0.1908542513847351"
+    arg: "0.09265121817588806" arg: "-0.1539815366268158" arg: "0.07899756729602814" arg: "-0.04436815530061722" arg: "-0.2454068809747696" arg: "-0.18386529386043549" arg: "-0.14677776396274567" arg: "0.6323122978210449" arg: "0.39544856548309326" arg: "0.15971237421035767" arg: "0.10913989692926407" arg: "-0.042497217655181885" arg: "-0.23099665343761444" arg: "0.04052138328552246" arg: "0.34883034229278564" arg: "0.0847955048084259" arg: "-0.08281111717224121" arg: "-0.061811413615942" arg: "-0.21920911967754364" arg: "-0.08061020076274872"
+    arg: "0.1676846295595169" arg: "-0.18172425031661987" arg: "0.2923012375831604" arg: "-0.14758338034152985" arg: "0.40604183077812195" arg: "0.1405867487192154" arg: "-0.23895759880542755" arg: "-0.12314226478338242" arg: "-0.25169745087623596" arg: "-0.31885266304016113" arg: "-0.07341024279594421" arg: "0.0072786142118275166" arg: "-0.0969509556889534" arg: "-0.2571040391921997" arg: "-0.33312639594078064" arg: "-0.2451372891664505" arg: "0.1654350608587265" arg: "0.033568061888217926" arg: "0.014660677872598171" arg: "0.377450555562973"
+    arg: "0.11867852509021759" arg: "0.0411519892513752" arg: "-0.4462774097919464" arg: "0.1362692266702652" arg: "0.14434905350208282" arg: "0.045803915709257126" arg: "-0.07773952186107635" arg: "0.27392011880874634" arg: "-0.14940162003040314" arg: "0.055528268218040466" arg: "-0.5712833404541016" arg: "-0.12384487688541412" arg: "-0.16526257991790771" arg: "-0.14264139533042908" arg: "-0.144387885928154" arg: "0.26574134826660156" arg: "-0.17008106410503387" arg: "0.18583066761493683" arg: "-0.17407818138599396" arg: "0.0841611996293068"
+    arg: "0.07560589909553528" arg: "0.10793375223875046" arg: "0.1858903020620346" arg: "0.17929036915302277" arg: "-0.014105351641774178" arg: "0.07215336710214615" arg: "0.034808021038770676" arg: "-0.023832565173506737" arg: "-0.005115351639688015" arg: "0.03793272748589516" arg: "-0.06749884784221649" arg: "-0.2857394814491272" arg: "-0.22204333543777466" arg: "0.07521218806505203" arg: "-0.22578758001327515" arg: "0.10578799247741699" arg: "-0.0599808394908905" arg: "-0.03470684587955475" arg: "0.04690929129719734" arg: "0.009294633753597736"
+  }
+}
+operand {
+  name: "recurrent_to_cell_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "0.15393993258476257" arg: "-0.1119932010769844" arg: "0.18920856714248657" arg: "0.1281609982252121" arg: "-0.12339968234300613" arg: "-0.08566756546497345" arg: "-0.13214115798473358" arg: "-0.0587150976061821" arg: "-0.1808837354183197" arg: "-0.08994600921869278" arg: "0.18792179226875305" arg: "0.31002986431121826" arg: "-0.23494939506053925" arg: "-0.13358882069587708" arg: "0.21817533671855927" arg: "-0.1144614890217781" arg: "-0.032731179147958755" arg: "0.1316293478012085" arg: "-0.21290943026542664" arg: "0.033711988478899"
+    arg: "0.2093488723039627" arg: "0.009070725180208683" arg: "-0.34243470430374146" arg: "0.2588043808937073" arg: "-0.12327506393194199" arg: "-0.06977886706590652" arg: "0.18403127789497375" arg: "-0.037799157202243805" arg: "-0.10396076738834381" arg: "0.4311140179634094" arg: "-0.16276852786540985" arg: "0.3511127233505249" arg: "0.1728871464729309" arg: "-0.3596697151660919" arg: "-0.029892150312662125" arg: "-0.13553234934806824" arg: "0.03372793272137642" arg: "-0.3119524121284485" arg: "0.04722945764660835" arg: "-0.0335264652967453"
+    arg: "-0.19715268909931183" arg: "0.18051989376544952" arg: "-0.25875094532966614" arg: "-0.2308073490858078" arg: "0.12977799773216248" arg: "0.11133529990911484" arg: "-0.10884438455104828" arg: "-0.006393382791429758" arg: "-0.0046616485342383385" arg: "0.07372598350048065" arg: "-0.17514462769031525" arg: "-0.06907986104488373" arg: "-0.1077096164226532" arg: "-0.2481498420238495" arg: "-0.14520783722400665" arg: "-0.06911041587591171" arg: "-0.02821161597967148" arg: "0.14755520224571228" arg: "0.3900660574436188" arg: "0.1893186867237091"
+    arg: "0.09458756446838379" arg: "0.054943062365055084" arg: "0.4107792377471924" arg: "0.05842319130897522" arg: "0.09731859713792801" arg: "-0.06020563840866089" arg: "0.2529062032699585" arg: "-0.2720320224761963" arg: "-0.10796058923006058" arg: "-0.08254134654998779" arg: "-0.07210174947977066" arg: "-0.36896562576293945" arg: "0.08478402346372604" arg: "0.15760378539562225" arg: "-0.12006833404302597" arg: "0.008080476894974709" arg: "0.3506588339805603" arg: "0.25160735845565796" arg: "0.08777479827404022" arg: "0.273798406124115"
+    arg: "0.01723896898329258" arg: "0.11717648804187775" arg: "-0.2846356928348541" arg: "-0.07879329472780228" arg: "0.2186465710401535" arg: "0.09551840275526047" arg: "0.04083137586712837" arg: "-0.0763259083032608" arg: "-0.17741726338863373" arg: "-0.22416481375694275" arg: "0.08309032022953033" arg: "-0.07316568493843079" arg: "-0.004594864323735237" arg: "0.04726291820406914" arg: "-0.060947902500629425" arg: "-0.02379523031413555" arg: "0.1387377828359604" arg: "0.0520065538585186" arg: "-0.009234771132469177" arg: "0.20820368826389313"
+    arg: "0.2090202122926712" arg: "0.09088768064975739" arg: "0.0712779089808464" arg: "0.4721727669239044" arg: "-0.2852536737918854" arg: "-0.030319523066282272" arg: "-0.15199345350265503" arg: "0.03313468396663666" arg: "0.130229651927948" arg: "0.11190740019083023" arg: "-0.033711377531290054" arg: "0.26203152537345886" arg: "0.1747232973575592" arg: "0.06380274146795273" arg: "0.10935788601636887" arg: "0.03934641182422638" arg: "-0.24481335282325745" arg: "-0.2071755826473236" arg: "0.21853256225585938" arg: "-0.05010126531124115"
+    arg: "-0.08034007996320724" arg: "0.06709744036197662" arg: "0.15941183269023895" arg: "0.3035742938518524" arg: "0.06431770324707031" arg: "0.22864562273025513" arg: "0.2153673619031906" arg: "-0.16501116752624512" arg: "0.08141324669122696" arg: "-0.1909857988357544" arg: "0.09936768561601639" arg: "0.05850536748766899" arg: "-0.007407554890960455" arg: "-0.0750204399228096" arg: "-0.011765131726861" arg: "-0.1525736153125763" arg: "0.2009558528661728" arg: "0.057866111397743225" arg: "-0.3028014302253723" arg: "0.0406017005443573"
+    arg: "0.3206914961338043" arg: "0.013377382420003414" arg: "0.08953910320997238" arg: "0.1381121575832367" arg: "-0.19908195734024048" arg: "-0.20327427983283997" arg: "0.10023070126771927" arg: "0.0965537428855896" arg: "0.09599238634109497" arg: "0.012801108881831169" arg: "-0.008848292753100395" arg: "-0.0921083316206932" arg: "-0.2611875534057617" arg: "0.20738714933395386" arg: "0.18287070095539093" arg: "0.06643958389759064" arg: "0.08770095556974411" arg: "-0.20850636065006256" arg: "-0.037345774471759796" arg: "-0.3267252743244171"
+    arg: "0.04550359770655632" arg: "0.1751193106174469" arg: "0.0021270427387207747" arg: "-0.12569129467010498" arg: "0.09540387243032455" arg: "0.2459857165813446" arg: "0.24060799181461334" arg: "-0.00685726385563612" arg: "-0.07234424352645874" arg: "0.09571491926908493" arg: "-0.3154931366443634" arg: "0.13833951950073242" arg: "0.17639833688735962" arg: "0.3401899039745331" arg: "-0.25327083468437195" arg: "0.003526201006025076" arg: "0.30618157982826233" arg: "-0.31580427289009094" arg: "0.03264538198709488" arg: "-0.102194644510746"
+    arg: "0.14761067926883698" arg: "0.02882370911538601" arg: "0.08630412817001343" arg: "0.18815916776657104" arg: "-0.17160621285438538" arg: "-0.3442608118057251" arg: "-0.12482235580682755" arg: "-0.3455544710159302" arg: "-0.0847967267036438" arg: "0.16379626095294952" arg: "0.11732957512140274" arg: "0.18391959369182587" arg: "0.2112390100955963" arg: "0.15884174406528473" arg: "-0.1210162416100502" arg: "0.03699047863483429" arg: "0.07807657867670059" arg: "-0.14232687652111053" arg: "0.007268161047250032" arg: "0.05068487673997879"
+    arg: "0.04104536026716232" arg: "0.201126366853714" arg: "0.19676734507083893" arg: "-0.03136518597602844" arg: "0.057088401168584824" arg: "0.217696413397789" arg: "0.0404636487364769" arg: "-0.16933280229568481" arg: "0.11017945408821106" arg: "0.16551776230335236" arg: "0.15519888699054718" arg: "0.20411789417266846" arg: "0.17852722108364105" arg: "0.24985377490520477" arg: "0.03789833188056946" arg: "-0.19242724776268005" arg: "0.0679841935634613" arg: "-0.08297871053218842" arg: "-0.017301911488175392" arg: "-0.07224911451339722"
+    arg: "-0.15029805898666382" arg: "-0.022065505385398865" arg: "0.3310281038284302" arg: "-0.0074359094724059105" arg: "0.1291237622499466" arg: "-0.3258497416973114" arg: "-0.020609457045793533" arg: "0.07960690557956696" arg: "0.18422964215278625" arg: "-0.015700064599514008" arg: "-0.0406377948820591" arg: "0.07060065865516663" arg: "-0.05204642191529274" arg: "-0.0752851590514183" arg: "0.29478275775909424" arg: "0.09939233958721161" arg: "-0.1349070519208908" arg: "0.08028685301542282" arg: "-0.1612706184387207" arg: "-0.3498779833316803"
+    arg: "0.09452734142541885" arg: "-0.14093227684497833" arg: "0.1231885701417923" arg: "-0.17776770889759064" arg: "-0.21792110800743103" arg: "0.008278626017272472" arg: "-0.11924610286951065" arg: "0.12319722771644592" arg: "0.09757496416568756" arg: "0.3845261037349701" arg: "0.06491772085428238" arg: "0.02287365309894085" arg: "0.10664971172809601" arg: "-0.2075091153383255" arg: "0.16306491196155548" arg: "-0.09945328533649445" arg: "0.03647858276963234" arg: "0.31740331649780273" arg: "-0.0575806088745594" arg: "-0.15998433530330658"
+    arg: "0.05049542337656021" arg: "0.10578017681837082" arg: "0.4129166305065155" arg: "-0.26145657896995544" arg: "0.09070956707000732" arg: "0.13368085026741028" arg: "0.047888197004795074" arg: "-0.26703593134880066" arg: "0.03738849610090256" arg: "0.0096968412399292" arg: "-0.02515929564833641" arg: "-0.08761339634656906" arg: "-0.08801304548978806" arg: "0.11130105704069138" arg: "-0.1670377254486084" arg: "-0.38100311160087585" arg: "0.08594627678394318" arg: "0.15826018154621124" arg: "0.15142755210399628" arg: "0.3665761351585388"
+    arg: "-0.2884967029094696" arg: "0.034480463713407516" arg: "0.080620676279068" arg: "0.0942501500248909" arg: "-0.17697006464004517" arg: "-0.04557788744568825" arg: "0.12102261930704117" arg: "-0.08095056563615799" arg: "0.20667794346809387" arg: "0.0005014429334551096" arg: "-0.24826794862747192" arg: "-0.06553015112876892" arg: "-0.24456636607646942" arg: "-0.004251034930348396" arg: "-0.02371463179588318" arg: "0.13635343313217163" arg: "0.1759263128042221" arg: "-0.2496115118265152" arg: "0.3222438097000122" arg: "-0.06805617362260818"
+    arg: "0.3151903748512268" arg: "-0.07969710230827332" arg: "-0.338588684797287" arg: "0.09406647831201553" arg: "-0.22847072780132294" arg: "-0.13792040944099426" arg: "0.2320234179496765" arg: "-0.10531327873468399" arg: "-0.21394342184066772" arg: "-0.05493509769439697" arg: "0.0776442140340805" arg: "-0.07514091581106186" arg: "0.23195593059062958" arg: "-0.016244227066636086" arg: "-0.12812721729278564" arg: "0.1941227912902832" arg: "-0.08763367682695389" arg: "-0.17611214518547058" arg: "0.05966200307011604" arg: "0.03175244480371475"
+    arg: "-0.3181533217430115" arg: "0.14994001388549805" arg: "-0.0519041046500206" arg: "0.10318135470151901" arg: "0.12232168763875961" arg: "0.12503929436206818" arg: "-0.042770031839609146" arg: "0.028692282736301422" arg: "0.041286103427410126" arg: "0.008777778595685959" arg: "-0.15041261911392212" arg: "-0.3636454939842224" arg: "-0.2648666799068451" arg: "0.12697425484657288" arg: "-0.04115947335958481" arg: "-0.1794285625219345" arg: "0.3467434346675873" arg: "0.09371137619018555" arg: "0.39284154772758484" arg: "0.10154542326927185"
+    arg: "0.03654832765460014" arg: "-0.0001514707983005792" arg: "0.05111170932650566" arg: "-0.43743401765823364" arg: "0.19728508591651917" arg: "0.10978388041257858" arg: "-0.2930853068828583" arg: "0.31976282596588135" arg: "0.07013546675443649" arg: "0.045205868780612946" arg: "0.12697115540504456" arg: "-0.17158143222332" arg: "-0.4531923532485962" arg: "-0.03989870846271515" arg: "0.028936142101883888" arg: "0.16511154174804688" arg: "-0.10384245961904526" arg: "-0.14950263500213623" arg: "0.10117360949516296" arg: "-0.1518079936504364"
+    arg: "0.19067397713661194" arg: "-0.011113183572888374" arg: "-0.3417884409427643" arg: "0.005517064593732357" arg: "-0.041157711297273636" arg: "-0.3314608931541443" arg: "0.012132381089031696" arg: "-0.034892488270998" arg: "-0.07512284815311432" arg: "0.1292932778596878" arg: "-0.22919918596744537" arg: "0.2461051344871521" arg: "0.22426217794418335" arg: "0.006595896556973457" arg: "-0.1439153254032135" arg: "-0.26290032267570496" arg: "-0.1798022985458374" arg: "-0.12984399497509003" arg: "0.065561443567276" arg: "-0.06624792516231537"
+    arg: "-0.19407491385936737" arg: "-0.0409831777215004" arg: "-0.008038614876568317" arg: "-0.03238639608025551" arg: "-0.04466156288981438" arg: "0.1601162552833557" arg: "-0.052234116941690445" arg: "0.15304076671600342" arg: "0.000367005035514012" arg: "0.022618206217885017" arg: "0.15538087487220764" arg: "0.25994208455085754" arg: "0.12255910038948059" arg: "-0.13710808753967285" arg: "-0.016998453065752983" arg: "0.2590976059436798" arg: "-0.011389931663870811" arg: "-0.034473538398742676" arg: "0.05619646608829498" arg: "0.08633460104465485"
+  }
+}
+operand {
+  name: "recurrent_to_output_weights"
+  type: FLOAT32
+  shape { dim: 20 dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "-0.35909947752952576" arg: "0.12187856435775757" arg: "0.14746889472007751" arg: "0.3129103481769562" arg: "-0.5547925233840942" arg: "-0.26812028884887695" arg: "0.2384958267211914" arg: "-0.47153329849243164" arg: "0.07567869871854782" arg: "0.28245386481285095" arg: "0.4810175597667694" arg: "0.031078442931175232" arg: "0.39412668347358704" arg: "0.001231769216246903" arg: "-0.0179451797157526" arg: "-0.3484187126159668" arg: "-0.1315481960773468" arg: "-0.19073595106601715" arg: "0.2959749102592468" arg: "-0.15430164337158203"
+    arg: "0.16478729248046875" arg: "-0.18185187876224518" arg: "-0.42523953318595886" arg: "0.28228330612182617" arg: "-0.5983712077140808" arg: "-0.31367194652557373" arg: "-0.3297293186187744" arg: "0.1790262758731842" arg: "-0.06721899658441544" arg: "0.27287885546684265" arg: "0.1248977854847908" arg: "0.529021680355072" arg: "-0.3688035011291504" arg: "-0.2915802299976349" arg: "0.5874091982841492" arg: "0.6444711685180664" arg: "-0.5507888793945312" arg: "-0.4991227984428406" arg: "-0.5332760810852051" arg: "-0.17446967959403992"
+    arg: "-0.10843317955732346" arg: "-0.2629949450492859" arg: "0.21471929550170898" arg: "0.09879318624734879" arg: "-0.0769701600074768" arg: "-0.23829951882362366" arg: "-0.14967726171016693" arg: "0.1522980034351349" arg: "0.0016457909950986505" arg: "0.07313574850559235" arg: "0.359075129032135" arg: "-0.29160916805267334" arg: "-0.1623256802558899" arg: "0.3452284634113312" arg: "0.11389480531215668" arg: "0.056126005947589874" arg: "0.1680738776922226" arg: "0.054511312395334244" arg: "-0.3061401844024658" arg: "-0.37002867460250854"
+    arg: "0.8169177174568176" arg: "-0.10186938941478729" arg: "0.035952117294073105" arg: "0.31021371483802795" arg: "-0.3045564293861389" arg: "0.16454839706420898" arg: "-0.007755322381854057" arg: "0.3747217655181885" arg: "0.028079498559236526" arg: "0.6176130771636963" arg: "0.3060242235660553" arg: "-0.1116616427898407" arg: "-0.0698426142334938" arg: "0.02596282958984375" arg: "0.40301159024238586" arg: "0.22842562198638916" arg: "-0.7979361414909363" arg: "-0.21555794775485992" arg: "0.22447574138641357" arg: "-0.09180140495300293"
+    arg: "0.6331534385681152" arg: "-0.07374905049800873" arg: "0.0910644456744194" arg: "0.7608184218406677" arg: "-0.05133755877614021" arg: "-0.2353716641664505" arg: "0.7358492612838745" arg: "-0.2672101557254791" arg: "0.23726515471935272" arg: "0.21083518862724304" arg: "0.39143991470336914" arg: "0.4840562045574188" arg: "0.42181020975112915" arg: "-0.02092970348894596" arg: "-0.00017688501975499094" arg: "-0.02578321099281311" arg: "-0.33251720666885376" arg: "-0.2569231390953064" arg: "-0.27487626671791077" arg: "-0.30479907989501953"
+    arg: "0.12656816840171814" arg: "-0.35632675886154175" arg: "0.05882206931710243" arg: "0.1031598150730133" arg: "-0.07713407278060913" arg: "0.06100684776902199" arg: "-0.1301981508731842" arg: "0.18870045244693756" arg: "0.34141841530799866" arg: "-0.5403046011924744" arg: "0.36341556906700134" arg: "0.4443875551223755" arg: "0.11223088949918747" arg: "0.07594747096300125" arg: "0.17846737802028656" arg: "0.30533328652381897" arg: "-0.22884678840637207" arg: "-0.2669167220592499" arg: "-0.7768117189407349" arg: "-0.37001490592956543"
+    arg: "0.1587837189435959" arg: "-0.191584974527359" arg: "-0.6631219387054443" arg: "0.06661315262317657" arg: "-0.4396548271179199" arg: "-0.4596345126628876" arg: "0.1870720386505127" arg: "0.1951659470796585" arg: "-0.08539465069770813" arg: "0.3470593988895416" arg: "0.19268564879894257" arg: "-0.046538181602954865" arg: "0.01603168435394764" arg: "-0.07825833559036255" arg: "0.09992441534996033" arg: "0.08686434477567673" arg: "-0.38704144954681396" arg: "-0.47036734223365784" arg: "0.0524212010204792" arg: "-0.372523695230484"
+    arg: "0.006748664658516645" arg: "-0.08730413764715195" arg: "-0.16790643334388733" arg: "0.31276655197143555" arg: "-0.44439390301704407" arg: "-0.23598124086856842" arg: "0.0694812685251236" arg: "0.38802817463874817" arg: "-0.012220374308526516" arg: "0.38149744272232056" arg: "-0.03641294315457344" arg: "0.0744020864367485" arg: "-0.08323682844638824" arg: "0.11382298171520233" arg: "0.2919921278953552" arg: "0.31642037630081177" arg: "-0.401195764541626" arg: "0.09580203890800476" arg: "-0.1458958089351654" arg: "-0.3990739583969116"
+    arg: "-0.017351288348436356" arg: "-0.15279312431812286" arg: "0.21107575297355652" arg: "0.23132845759391785" arg: "0.12567712366580963" arg: "0.0009088824735954404" arg: "-0.5392304062843323" arg: "-0.503669023513794" arg: "0.1523285210132599" arg: "0.2695973813533783" arg: "0.2366502732038498" arg: "0.3115360140800476" arg: "-0.3943549692630768" arg: "0.6869263648986816" arg: "0.20123623311519623" arg: "-0.003731918754056096" arg: "0.2607108950614929" arg: "-0.3499254584312439" arg: "-0.004152949899435043" arg: "-0.1376078873872757"
+    arg: "0.4573622941970825" arg: "0.008549842983484268" arg: "0.1646938920021057" arg: "-0.15896114706993103" arg: "-0.4295574128627777" arg: "0.06403962522745132" arg: "-0.012177926488220692" arg: "0.5018934607505798" arg: "0.0375320166349411" arg: "0.43595317006111145" arg: "-0.05773438140749931" arg: "0.13049593567848206" arg: "-0.1468954086303711" arg: "-0.4093998372554779" arg: "0.4959154427051544" arg: "0.7173134684562683" arg: "-0.5174667239189148" arg: "-0.16707409918308258" arg: "-0.06118558719754219" arg: "-0.11275004595518112"
+    arg: "0.08968205004930496" arg: "0.3198257088661194" arg: "0.07224604487419128" arg: "0.5600743889808655" arg: "0.024834752082824707" arg: "-0.02439100854098797" arg: "-0.1513833850622177" arg: "0.13906888663768768" arg: "0.06407716870307922" arg: "0.5332576036453247" arg: "0.24956916272640228" arg: "-0.044385701417922974" arg: "-0.4433465301990509" arg: "-0.19094131886959076" arg: "0.4768398106098175" arg: "0.21503591537475586" arg: "-0.218861386179924" arg: "-0.4321509003639221" arg: "-0.24130387604236603" arg: "-0.07977084070444107"
+    arg: "0.002716424874961376" arg: "-0.1713045984506607" arg: "-0.12604790925979614" arg: "-0.03560760244727135" arg: "-0.5757992267608643" arg: "-0.1557251513004303" arg: "-0.05827505886554718" arg: "0.3337538540363312" arg: "-0.4115898907184601" arg: "0.5126633048057556" arg: "0.14806263148784637" arg: "0.40081098675727844" arg: "0.8833869695663452" arg: "-0.19723086059093475" arg: "0.09533816576004028" arg: "0.03869156911969185" arg: "-0.2973725199699402" arg: "0.022853707894682884" arg: "-0.0228166151791811" arg: "-0.4052131772041321"
+    arg: "0.12930545210838318" arg: "0.01575206033885479" arg: "-0.21314911544322968" arg: "0.5510196685791016" arg: "0.06540991365909576" arg: "-0.07084762305021286" arg: "0.3234975337982178" arg: "0.19345852732658386" arg: "0.16359369456768036" arg: "-0.02992691472172737" arg: "0.07857825607061386" arg: "0.3506908714771271" arg: "0.16494658589363098" arg: "0.07570064812898636" arg: "0.32486459612846375" arg: "-0.14951008558273315" arg: "-0.022363830357789993" arg: "-0.42179420590400696" arg: "-0.24661937355995178" arg: "-0.08302409946918488"
+    arg: "0.2494393140077591" arg: "-0.12944501638412476" arg: "-0.010796070098876953" arg: "0.15976394712924957" arg: "-0.01106332242488861" arg: "0.25831347703933716" arg: "0.18664048612117767" arg: "-0.03495928645133972" arg: "0.01873226836323738" arg: "0.02704462595283985" arg: "0.1773315966129303" arg: "0.09905895590782166" arg: "0.137725368142128" arg: "-0.4195314347743988" arg: "0.20205777883529663" arg: "0.25744083523750305" arg: "-0.4343162178993225" arg: "0.08337675034999847" arg: "-0.24768808484077454" arg: "0.05348324030637741"
+    arg: "-0.07421242445707321" arg: "0.08401253819465637" arg: "0.24182510375976562" arg: "-0.19996227324008942" arg: "-0.26596978306770325" arg: "-0.10460428893566132" arg: "-0.09030365198850632" arg: "0.3622499406337738" arg: "0.32519716024398804" arg: "0.3067288398742676" arg: "-0.0695832222700119" arg: "-0.10316962748765945" arg: "-0.09733156114816666" arg: "0.4681766629219055" arg: "0.3733525574207306" arg: "-0.013295430690050125" arg: "-0.11883660405874252" arg: "-0.10412082821130753" arg: "0.05678151175379753" arg: "-0.11783196032047272"
+    arg: "0.048583026975393295" arg: "-0.9528340101242065" arg: "0.10752814263105392" arg: "0.273784339427948" arg: "0.23048622906208038" arg: "-0.2551514804363251" arg: "-0.21344983577728271" arg: "0.2589189112186432" arg: "-0.1326867789030075" arg: "-0.14273332059383392" arg: "0.11125936359167099" arg: "0.10763772577047348" arg: "-0.3638816177845001" arg: "0.6586386561393738" arg: "0.6191070675849915" arg: "0.2745305895805359" arg: "-0.21111124753952026" arg: "0.23943224549293518" arg: "-0.5838378667831421" arg: "-0.7447165250778198"
+    arg: "0.27415889501571655" arg: "-0.10696078091859818" arg: "-0.1905016303062439" arg: "0.17716637253761292" arg: "-0.17008160054683685" arg: "-0.38646024465560913" arg: "0.17075011134147644" arg: "-0.0971580222249031" arg: "0.36582818627357483" arg: "0.3553922176361084" arg: "0.3533395528793335" arg: "0.46518567204475403" arg: "0.12306690216064453" arg: "0.3765827715396881" arg: "0.27485108375549316" arg: "0.026894697919487953" arg: "-0.13947726786136627" arg: "-0.4675980508327484" arg: "0.000053708172345068306" arg: "-0.1514354646205902"
+    arg: "0.034218866378068924" arg: "-0.3962448537349701" arg: "-0.08128349483013153" arg: "0.10788826644420624" arg: "-0.3110845983028412" arg: "0.25610488653182983" arg: "-0.5693814754486084" arg: "0.6281890273094177" arg: "0.0010718648554757237" arg: "-0.21038493514060974" arg: "0.18425892293453217" arg: "-0.35341814160346985" arg: "-0.2984526455402374" arg: "0.29100173711776733" arg: "0.4346262514591217" arg: "-0.02309197559952736" arg: "0.06577077507972717" arg: "-0.24334858357906342" arg: "-0.34281492233276367" arg: "-0.4032599627971649"
+    arg: "0.34936246275901794" arg: "0.3322518467903137" arg: "-0.2656654119491577" arg: "0.22830642759799957" arg: "-0.11201204359531403" arg: "-0.1707642823457718" arg: "0.007749658077955246" arg: "0.43952593207359314" arg: "0.14750634133815765" arg: "0.42360368371009827" arg: "0.1105399876832962" arg: "-0.06718066334724426" arg: "-0.175845667719841" arg: "0.023229194805026054" arg: "0.35441142320632935" arg: "0.35180309414863586" arg: "-0.561530351638794" arg: "-0.1788090020418167" arg: "0.05351807549595833" arg: "-0.3240300118923187"
+    arg: "0.2829385995864868" arg: "0.09240324050188065" arg: "0.10970980674028397" arg: "1.01627779006958" arg: "-0.3717207908630371" arg: "-0.2776918113231659" arg: "0.6677582263946533" arg: "-0.2235853224992752" arg: "-0.06214175000786781" arg: "0.23073340952396393" arg: "0.3371483087539673" arg: "-0.029265087097883224" arg: "0.25156235694885254" arg: "0.43319517374038696" arg: "0.035503044724464417" arg: "0.12156634777784348" arg: "-0.24198615550994873" arg: "-0.42002007365226746" arg: "-0.11373946070671082" arg: "-0.28098201751708984"
+  }
+}
+operand {
+  name: "input_gate_bias"
+  type: FLOAT32
+  shape { dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "0.39238446950912476" arg: "-0.040046464651823044" arg: "0.13657712936401367" arg: "0.35934528708457947" arg: "0.321681946516037" arg: "0.0616583526134491" arg: "0.11477429419755936" arg: "0.20044274628162384" arg: "0.011154969222843647" arg: "0.24244074523448944" arg: "0.27598848938941956" arg: "0.4028998911380768" arg: "0.21931242942810059" arg: "0.3108941316604614" arg: "0.1841004192829132" arg: "0.14638805389404297" arg: "0.46200960874557495" arg: "0.24594353139400482" arg: "0.07526364177465439" arg: "-0.22416549921035767"
+  }
+}
+operand {
+  name: "forget_gate_bias"
+  type: FLOAT32
+  shape { dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "1.2047474384307861" arg: "1.2191035747528076" arg: "0.871356725692749" arg: "1.0395587682724" arg: "1.150162935256958" arg: "1.0623992681503296" arg: "1.0699368715286255" arg: "1.0769526958465576" arg: "1.1270850896835327" arg: "1.151424527168274" arg: "1.1118133068084717" arg: "1.150691032409668" arg: "0.9700227975845337" arg: "1.0458472967147827" arg: "1.0566719770431519" arg: "1.036710262298584" arg: "1.1118052005767822" arg: "0.9024409651756287" arg: "0.968490481376648" arg: "1.0276471376419067"
+  }
+}
+operand {
+  name: "cell_gate_bias"
+  type: FLOAT32
+  shape { dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "0.027094807475805283" arg: "0.08994408696889877" arg: "0.048134010285139084" arg: "-0.24551978707313538" arg: "0.016918446868658066" arg: "0.0765792727470398" arg: "-0.0031757261604070663" arg: "0.1118675172328949" arg: "-0.0806640088558197" arg: "0.003836719784885645" arg: "-0.02241756208240986" arg: "0.1585727483034134" arg: "0.07568418234586716" arg: "-0.008664635010063648" arg: "-0.0036717928014695644" arg: "-0.036391645669937134" arg: "-0.012257440015673637" arg: "0.05013420805335045" arg: "-0.014501656405627728" arg: "0.22225865721702576"
+  }
+}
+operand {
+  name: "output_gate_bias"
+  type: FLOAT32
+  shape { dim: 20 }
+  filler {
+    tag: "explicit"
+    arg: "0.2127157747745514" arg: "0.3538936972618103" arg: "0.283548504114151" arg: "1.0181398391723633" arg: "0.40145981311798096" arg: "0.27438417077064514" arg: "0.2998640537261963" arg: "0.5031589865684509" arg: "0.0011858611833304167" arg: "0.5359497666358948" arg: "0.5380197763442993" arg: "0.7726592421531677" arg: "0.27104392647743225" arg: "0.4670105576515198" arg: "0.47913044691085815" arg: "0.4600663185119629" arg: "0.3923473060131073" arg: "-0.03211608901619911" arg: "0.6604049205780029" arg: "0.2065485268831253"
+  }
+}
+operand {
+  name: "activation_state"
+  type: FLOAT32
+  shape { dim: 1 dim: 20 }
+  filler {
+    tag: "explicit"
+  }
+}
+operand {
+  name: "cell_state"
+  type: FLOAT32
+  shape { dim: 1 dim: 20 }
+  filler {
+    tag: "explicit"
+  }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 28 dim: 20 }
+}
+operation {
+  type: "UnidirectionalSequenceLSTM"
+  unidirectional_sequence_lstm_options {
+    activation: TANH
+    cell_clip: 10.0
+    proj_clip: 0.0
+    time_major: false
+    asymmetric_quantize_inputs: false
+  }
+  input: "ifm"
+  input: "input_to_input_weights"
+  input: "input_to_forget_weights"
+  input: "input_to_cell_weights"
+  input: "input_to_output_weights"
+  input: "recurrent_to_input_weights"
+  input: "recurrent_to_forget_weights"
+  input: "recurrent_to_cell_weights"
+  input: "recurrent_to_output_weights"
+  input: ""
+  input: ""
+  input: ""
+  input: "input_gate_bias"
+  input: "forget_gate_bias"
+  input: "cell_gate_bias"
+  input: "output_gate_bias"
+  input: ""
+  input: ""
+  input: "activation_state"
+  input: "cell_state"
+  input: ""
+  input: ""
+  input: ""
+  input: ""
+  output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.reverse b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/UnidirectionalSequenceLSTM_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
index 3110b5ed9..887380c48 100644
--- a/res/TensorFlowLiteRecipes/Unique_000/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
@@ -6,7 +6,7 @@ operand {
 operand {
   name: "ofm"
   type: FLOAT32
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
index d654f79b9..9beb51690 100644
--- a/res/TensorFlowLiteRecipes/Unique_001/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
@@ -6,7 +6,7 @@ operand {
 operand {
   name: "ofm"
   type: FLOAT32
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
index d9f2393b8..67b947ff8 100644
--- a/res/TensorFlowLiteRecipes/Unique_002/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
@@ -6,7 +6,7 @@ operand {
 operand {
   name: "ofm"
   type: INT32
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
index de9e87af9..375db66e8 100644
--- a/res/TensorFlowLiteRecipes/Unique_003/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
@@ -6,7 +6,7 @@ operand {
 operand {
   name: "ofm"
   type: INT32
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
index 3906d2c5e..d3985e401 100644
--- a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
@@ -7,7 +7,7 @@ operand {
 operand {
   name: "ofm"
   type: UINT8
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
index 2bac10ae7..b08dd85cc 100644
--- a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
+++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
@@ -7,7 +7,7 @@ operand {
 operand {
   name: "ofm"
   type: UINT8
-  shape { dim: 0 }
+  shape { }
 }
 operand {
   name: "ofm_idx"
diff --git a/res/TensorFlowPythonExamples/examples/atrous_conv2d/__init__.py b/res/TensorFlowPythonExamples/examples/atrous_conv2d/__init__.py
new file mode 100644
index 000000000..90756b0b0
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/atrous_conv2d/__init__.py
@@ -0,0 +1,8 @@
+import tensorflow as tf
+import numpy as np
+
+in_ = tf.compat.v1.placeholder(tf.float32, shape=(1, 32, 32, 3), name="Hole")
+
+filters = np.random.uniform(low=-1., high=1, size=[5, 5, 3, 32]).astype(np.float32)
+
+op_ = tf.compat.v1.nn.atrous_conv2d(in_, filters, 2, "VALID")
diff --git a/res/TensorFlowPythonExamples/examples/flatten/__init__.py b/res/TensorFlowPythonExamples/examples/flatten/__init__.py
new file mode 100644
index 000000000..bb6dbaa2b
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/flatten/__init__.py
@@ -0,0 +1,5 @@
+import tensorflow as tf
+
+in_ = tf.compat.v1.placeholder(dtype=tf.float32, shape=(3, 3), name="Hole")
+
+op_ = tf.compat.v1.layers.flatten(in_)
diff --git a/res/TensorFlowPythonExamples/examples/instance_norm/__init__.py b/res/TensorFlowPythonExamples/examples/instance_norm/__init__.py
new file mode 100644
index 000000000..b44942c39
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/instance_norm/__init__.py
@@ -0,0 +1,22 @@
+import tensorflow as tf
+
+sess = tf.Session()
+
+in_ = tf.compat.v1.placeholder(dtype=tf.float32, shape=(3, 3), name="Hole")
+norm_ = tf.contrib.layers.instance_norm(in_)
+
+# we need to save checkpoint to freeze dropped model
+init = tf.initialize_all_variables()
+sess.run(init)
+
+saver = tf.train.Saver()
+saver.save(sess, './ckpt/instance_norm.ckpt')
+
+# use below command to freeze this model after running tfpem.py
+'''
+freeze_graph --input_graph instance_norm.pbtxt \
+--input_binary=false \
+--input_checkpoint=./ckpt/instance_norm.ckpt \
+--output_node_names=InstanceNorm/instancenorm/add_1 \
+--output_graph instance_norm_fr.pbtxt
+'''
diff --git a/res/TensorFlowPythonExamples/examples/unidirectional_sequence_LSTM/__init__.py b/res/TensorFlowPythonExamples/examples/unidirectional_sequence_LSTM/__init__.py
new file mode 100644
index 000000000..eaeb32ac3
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/unidirectional_sequence_LSTM/__init__.py
@@ -0,0 +1,4 @@
+import tensorflow as tf
+
+in_ = tf.compat.v1.placeholder(dtype=tf.float32, shape=[28, 28, 3], name="Hole")
+op_ = tf.compat.v1.keras.layers.LSTM(1, time_major=False, return_sequences=True)(in_)
diff --git a/runtime/contrib/android/api/Android.mk b/runtime/contrib/android/api/Android.mk
index a056eff9d..3c768cca5 100644
--- a/runtime/contrib/android/api/Android.mk
+++ b/runtime/contrib/android/api/Android.mk
@@ -4,7 +4,5 @@ include $(CLEAR_VARS)
 API_ROOT_PATH := $(LOCAL_PATH)
 PREBUILT_LIB :=
 
-include $(API_ROOT_PATH)/prebuilt/Android.mk
+include $(API_ROOT_PATH)/Prebuilt.mk
 include $(API_ROOT_PATH)/src/main/native/Android.mk
-
-#$(warning $(PREBUILT_LIB))
diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk
new file mode 100644
index 000000000..7d9f56582
--- /dev/null
+++ b/runtime/contrib/android/api/Prebuilt.mk
@@ -0,0 +1,70 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+ifndef ONERT_PREBUILT_LIB_DIR
+$(error ONERT_PREBUILT_LIB_DIR is not set)
+endif
+
+# libcircle_loader
+include $(CLEAR_VARS)
+LOCAL_MODULE := circle_loader
+PREBUILT_LIB += circle_loader
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libcircle_loader.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# libtflite_loader
+include $(CLEAR_VARS)
+LOCAL_MODULE := tflite_loader
+PREBUILT_LIB += tflite_loader
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libtflite_loader.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# libtensorflowlite_jni
+include $(CLEAR_VARS)
+LOCAL_MODULE := tensorflowlite_jni
+PREBUILT_LIB += tensorflowlite_jni
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libtensorflowlite_jni.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# libnnfw
+include $(CLEAR_VARS)
+LOCAL_MODULE := nnfw-dev
+PREBUILT_LIB += nnfw-dev
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libnnfw-dev.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# libonert_core
+include $(CLEAR_VARS)
+LOCAL_MODULE := onert_core
+PREBUILT_LIB += onert_core
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libonert_core.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# backend_cpu
+include $(CLEAR_VARS)
+LOCAL_MODULE := backend_cpu
+PREBUILT_LIB += backend_cpu
+LOCAL_SRC_FILES := \
+		$(ONERT_PREBUILT_LIB_DIR)/libbackend_cpu.so
+include $(PREBUILT_SHARED_LIBRARY)
+
+# TODO Support backend acl
+# backend_acl
+ifeq ($(ONERT_CONTAINS_ACL), 1)
+	$(error containing acl backend doesn't supported yet)
+endif
+
+# backend_ext
+ifneq ($(ONERT_EXT_PREBUILT_LIB), )
+include $(CLEAR_VARS)
+LOCAL_MODULE := backend_ext
+PREBUILT_LIB += backend_ext
+LOCAL_SRC_FILES := \
+		$(ONERT_EXT_PREBUILT_LIB)
+include $(PREBUILT_SHARED_LIBRARY)
+endif
diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle
index def89eeac..afc53d936 100644
--- a/runtime/contrib/android/api/build.gradle
+++ b/runtime/contrib/android/api/build.gradle
@@ -8,11 +8,39 @@ android {
         minSdkVersion 26
         targetSdkVersion 29
         versionCode 1
-        versionName "1.9.0"
+        versionName "1.10.0"
 
         externalNativeBuild {
             ndkBuild {
-                arguments "ONERT_API_INC_DIR=${project.projectDir}/../../../onert/api/include"
+                def onert_header_dir
+                if (project.hasProperty('onertHeaderDir'))
+                    onert_header_dir = project.onertHeaderDir
+                else
+                    onert_header_dir = "${project.projectDir}/../../../onert/api/include"
+
+                def onert_lib_dir
+                if (project.hasProperty('onertLibDir'))
+                    onert_lib_dir = project.onertLibDir
+                else
+                    onert_lib_dir = "${project.projectDir}/../../../../Product/out/lib"
+
+                def onert_contains_acl
+                if (project.hasProperty('onertContainsAcl'))
+                    onert_contains_acl = 1
+                else
+                    onert_contains_acl = 0
+
+                def onert_ext_lib
+                if (project.hasProperty('onertExtLib'))
+                    onert_ext_lib = project.onertExtLib
+                else
+                    onert_ext_lib = ""
+
+                arguments "ONERT_API_INC_DIR=$onert_header_dir",
+                          "ONERT_PREBUILT_LIB_DIR=$onert_lib_dir",
+                          "ONERT_CONTAINS_ACL=$onert_contains_acl",
+                          "ONERT_EXT_PREBUILT_LIB=$onert_ext_lib"
+
                 abiFilters 'arm64-v8a'
             }
         }
diff --git a/runtime/contrib/android/api/prebuilt/Android.mk b/runtime/contrib/android/api/prebuilt/Android.mk
deleted file mode 100644
index e8a9f0755..000000000
--- a/runtime/contrib/android/api/prebuilt/Android.mk
+++ /dev/null
@@ -1,9 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-PREBUILT_PATH := $(LOCAL_PATH)
-include $(PREBUILT_PATH)/backend_cpu/Android.mk
-include $(PREBUILT_PATH)/circle_loader/Android.mk
-include $(PREBUILT_PATH)/nnfw-dev/Android.mk
-include $(PREBUILT_PATH)/onert_core/Android.mk
-include $(PREBUILT_PATH)/tensorflowlite_jni/Android.mk
-include $(PREBUILT_PATH)/tflite_loader/Android.mk
diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk b/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk
deleted file mode 100644
index ccda9ea90..000000000
--- a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := backend_cpu
-PREBUILT_LIB += backend_cpu
-LOCAL_SRC_FILES := \
-		libbackend_cpu.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so b/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so
deleted file mode 120000
index 3d577cf5c..000000000
--- a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libbackend_cpu.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk b/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk
deleted file mode 100644
index 2e481e93e..000000000
--- a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := circle_loader
-PREBUILT_LIB += circle_loader
-LOCAL_SRC_FILES := \
-		libcircle_loader.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so b/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so
deleted file mode 120000
index 528d7017f..000000000
--- a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libcircle_loader.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk b/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk
deleted file mode 100644
index 10cb8f6f4..000000000
--- a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := nnfw-dev
-PREBUILT_LIB += nnfw-dev
-LOCAL_SRC_FILES := \
-		libnnfw-dev.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so b/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so
deleted file mode 120000
index 1913db8d7..000000000
--- a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libnnfw-dev.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk b/runtime/contrib/android/api/prebuilt/onert_core/Android.mk
deleted file mode 100644
index a6682a24f..000000000
--- a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := onert_core
-PREBUILT_LIB += onert_core
-LOCAL_SRC_FILES := \
-		libonert_core.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so b/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so
deleted file mode 120000
index bafe11cb9..000000000
--- a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libonert_core.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk
deleted file mode 100644
index 823cf0747..000000000
--- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := tensorflowlite_jni
-PREBUILT_LIB += tensorflowlite_jni
-LOCAL_SRC_FILES := \
-		libtensorflowlite_jni.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so
deleted file mode 120000
index d3d72a5a7..000000000
--- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libtensorflowlite_jni.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk b/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk
deleted file mode 100644
index 135ac1dad..000000000
--- a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_MODULE := tflite_loader
-PREBUILT_LIB += tflite_loader
-LOCAL_SRC_FILES := \
-		libtflite_loader.so
-include $(PREBUILT_SHARED_LIBRARY)
diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so b/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so
deleted file mode 120000
index 4c001aec0..000000000
--- a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../Product/out/lib/libtflite_loader.so
-\ No newline at end of file
diff --git a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp
index 1644e0f7f..209264d31 100644
--- a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp
+++ b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp
@@ -121,8 +121,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet
 
   if (jni::setInput(handle, params) == false)
   {
-    __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setOutput",
-                        __PRETTY_FUNCTION__);
+    __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setInput", __PRETTY_FUNCTION__);
     return JNI_FALSE;
   }
 
diff --git a/runtime/contrib/android_benchmark_app/CMakeLists.txt b/runtime/contrib/android_benchmark_app/CMakeLists.txt
index 55dbf0024..beb279cb9 100644
--- a/runtime/contrib/android_benchmark_app/CMakeLists.txt
+++ b/runtime/contrib/android_benchmark_app/CMakeLists.txt
@@ -55,7 +55,7 @@ target_link_libraries(android_benchmark_native nnfw_lib_tflite)
 target_link_libraries(android_benchmark_native nnfw_lib_misc)
 target_link_libraries(android_benchmark_native log)
 
-nnas_find_package(FlatBuffersSource EXACT 1.11 REQUIRED)
+nnas_find_package(FlatBuffersSource EXACT 1.12 REQUIRED)
 target_include_directories(android_benchmark_native PUBLIC ${FlatBuffersSource_DIR}/include .)
 
 add_custom_target(android-benchmark-apk ALL
diff --git a/runtime/libs/ndarray/src/ContiguousSpan.cpp b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h
index e06cfc2a1..6e8e12ba4 100644
--- a/runtime/libs/ndarray/src/ContiguousSpan.cpp
+++ b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,18 +14,27 @@
  * limitations under the License.
  */
 
-#include "ndarray/ContiguousSpan.h"
+#ifndef __NNFW_BENCHMARK_MEMORY_INFO_H__
+#define __NNFW_BENCHMARK_MEMORY_INFO_H__
 
-namespace ndarray
+#include <cstdint>
+#include <string>
+
+namespace benchmark
 {
 
-template class ContiguousSpan<float, true>;
-template class ContiguousSpan<float, false>;
-template class ContiguousSpan<int32_t, true>;
-template class ContiguousSpan<int32_t, false>;
-template class ContiguousSpan<uint32_t, true>;
-template class ContiguousSpan<uint32_t, false>;
-template class ContiguousSpan<uint8_t, true>;
-template class ContiguousSpan<uint8_t, false>;
+bool prepareVmRSS();
+bool prepareVmHWM();
+bool prepareGpuMemory();
+bool preparePssSum();
+
+uint32_t getVmRSS();
+uint32_t getVmHWM();
+uint32_t getGpuMemory(const std::string &process_name);
+uint32_t getPssSum();
+
+std::string getProcessName();
+
+} // namespace benchmark
 
-} // namespace ndarray
+#endif // __NNFW_BENCHMARK_MEMORY_INFO_H__
diff --git a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h
index 48caa3b3a..47db3fd77 100644
--- a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h
+++ b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h
@@ -57,10 +57,6 @@ public:
 private:
   void process();
   bool prepareMemoryPolling();
-  uint32_t getVmRSS();
-  uint32_t getVmHWM();
-  uint32_t getGpuMemory();
-  uint32_t getPssSum();
 
 private:
   std::chrono::milliseconds _duration;
diff --git a/runtime/libs/benchmark/include/benchmark/Phases.h b/runtime/libs/benchmark/include/benchmark/Phases.h
index 936a89742..7d642782a 100644
--- a/runtime/libs/benchmark/include/benchmark/Phases.h
+++ b/runtime/libs/benchmark/include/benchmark/Phases.h
@@ -50,6 +50,9 @@ public:
   const MemoryPoller &mem_poll() const { return *_mem_poll; }
   const Phase &at(const std::string &tag) const { return _phases.at(tag); }
 
+  uint32_t mem_before_init() const { return _mem_before_init; }
+  uint32_t mem_after_run() const { return _mem_after_run; }
+
 private:
   void run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc *post, uint32_t loop_num,
            bool option_disable);
@@ -58,6 +61,8 @@ private:
   const PhaseOption _option;
   std::unordered_map<std::string, Phase> _phases;
   std::unique_ptr<MemoryPoller> _mem_poll;
+  uint32_t _mem_before_init;
+  uint32_t _mem_after_run;
 };
 
 } // namespace benchmark
diff --git a/runtime/libs/benchmark/include/benchmark/Result.h b/runtime/libs/benchmark/include/benchmark/Result.h
index 69084b300..7604aa904 100644
--- a/runtime/libs/benchmark/include/benchmark/Result.h
+++ b/runtime/libs/benchmark/include/benchmark/Result.h
@@ -34,6 +34,8 @@ public:
   double time[PhaseEnum::END_OF_PHASE][FigureType::END_OF_FIG_TYPE];
   uint32_t memory[PhaseEnum::END_OF_PHASE][MemoryType::END_OF_MEM_TYPE];
   bool print_memory = false;
+  uint32_t init_memory = 0;
+  uint32_t peak_memory = 0;
 };
 
 // TODO Support not only stdout but also ostream
diff --git a/runtime/libs/benchmark/src/MemoryInfo.cpp b/runtime/libs/benchmark/src/MemoryInfo.cpp
new file mode 100644
index 000000000..20d262961
--- /dev/null
+++ b/runtime/libs/benchmark/src/MemoryInfo.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark/MemoryInfo.h"
+
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <sstream>
+#include <cassert>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+namespace
+{
+
+const std::string proc_status_path("/proc/self/status");
+const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory");
+const std::string proc_smaps_path("/proc/self/smaps");
+
+bool isStrNumber(const std::string &s)
+{
+  return !s.empty() &&
+         std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end();
+}
+
+std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t")
+{
+  std::vector<std::string> words;
+  size_t prev = 0, pos;
+
+  while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos)
+  {
+    if (pos > prev)
+      words.emplace_back(line.substr(prev, pos - prev));
+    prev = pos + 1;
+  }
+
+  if (prev < line.length())
+    words.emplace_back(line.substr(prev, std::string::npos));
+
+  return words;
+}
+
+std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key)
+{
+  std::ifstream ifs(file);
+  assert(ifs.is_open());
+
+  std::string line;
+  std::vector<std::string> val;
+
+  bool found = false;
+  while (std::getline(ifs, line))
+  {
+    if (line.find(key) != std::string::npos)
+    {
+      found = true;
+      break;
+    }
+  }
+  ifs.close();
+
+  if (!found)
+  {
+    // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase.
+    // At that time, just return empty.
+    return val;
+  }
+
+  val = splitLine(line);
+  return val;
+}
+
+// Because of smaps' structure, returns sum value as uint32_t
+uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key)
+{
+  std::ifstream ifs(file);
+  assert(ifs.is_open());
+
+  std::string line;
+  uint32_t sum = 0;
+  while (std::getline(ifs, line))
+  {
+    if (line.find(key) != std::string::npos)
+    {
+      // an example by splitLine()
+      // `Pss:                   0 kB`
+      // val[0]: "Pss:", val[1]: "0" val[2]: "kB"
+      auto val = splitLine(line);
+      assert(val.size() != 0);
+      // SwapPss could show so that check where Pss is at the beginning
+      if (val[0].find("Pss") != 0)
+      {
+        continue;
+      }
+      sum += std::stoul(val[1]);
+    }
+  }
+
+  return sum;
+}
+
+} // namespace
+
+namespace benchmark
+{
+
+bool prepareVmRSS() { return std::ifstream(proc_status_path).is_open(); }
+
+bool prepareVmHWM() { return std::ifstream(proc_status_path).is_open(); }
+
+bool prepareGpuMemory() { return std::ifstream(gpu_memory_path).is_open(); }
+
+bool preparePssSum() { return std::ifstream(proc_smaps_path).is_open(); }
+
+uint32_t getVmRSS()
+{
+  auto val = getValueFromFileStatus(proc_status_path, "VmRSS");
+  if (val.size() == 0)
+    return 0;
+  assert(isStrNumber(val[1]));
+  return std::stoul(val[1]);
+}
+
+uint32_t getVmHWM()
+{
+  auto val = getValueFromFileStatus(proc_status_path, "VmHWM");
+  if (val.size() == 0)
+    return 0;
+  // key: value
+  assert(isStrNumber(val[1]));
+  return std::stoul(val[1]);
+}
+
+uint32_t getGpuMemory(const std::string &process_name)
+{
+  assert(!process_name.empty());
+  auto val = getValueFromFileStatus(gpu_memory_path, process_name);
+  if (val.size() == 0)
+    return 0;
+  // process_name -> pid -> gpu_mem -> max_gpu_mem
+  assert(isStrNumber(val[2]));
+  return std::stoul(val[2]);
+}
+
+uint32_t getPssSum() { return getSumValueFromFileSmaps(proc_smaps_path, "Pss"); }
+
+std::string getProcessName()
+{
+  auto val = getValueFromFileStatus(proc_status_path, "Name");
+  assert(val.size() >= 2);
+  return val[1];
+}
+
+} // namespace benchmark
diff --git a/runtime/libs/benchmark/src/MemoryPoller.cpp b/runtime/libs/benchmark/src/MemoryPoller.cpp
index 61fdecd46..050b5b163 100644
--- a/runtime/libs/benchmark/src/MemoryPoller.cpp
+++ b/runtime/libs/benchmark/src/MemoryPoller.cpp
@@ -16,106 +16,13 @@
 
 #include "benchmark/MemoryPoller.h"
 #include "benchmark/Types.h"
+#include "benchmark/MemoryInfo.h"
 
 #include <vector>
-#include <fstream>
-#include <sstream>
 #include <stdexcept>
 #include <cassert>
 #include <iostream>
 
-namespace
-{
-
-const std::string proc_status_path("/proc/self/status");
-const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory");
-const std::string proc_smaps_path("/proc/self/smaps");
-
-bool isStrNumber(const std::string &s)
-{
-  return !s.empty() &&
-         std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end();
-}
-
-std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t")
-{
-  std::vector<std::string> words;
-  size_t prev = 0, pos;
-
-  while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos)
-  {
-    if (pos > prev)
-      words.emplace_back(line.substr(prev, pos - prev));
-    prev = pos + 1;
-  }
-
-  if (prev < line.length())
-    words.emplace_back(line.substr(prev, std::string::npos));
-
-  return words;
-}
-
-std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key)
-{
-  std::ifstream ifs(file);
-  assert(ifs.is_open());
-
-  std::string line;
-  std::vector<std::string> val;
-
-  bool found = false;
-  while (std::getline(ifs, line))
-  {
-    if (line.find(key) != std::string::npos)
-    {
-      found = true;
-      break;
-    }
-  }
-  ifs.close();
-
-  if (!found)
-  {
-    // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase.
-    // At that time, just return empty.
-    return val;
-  }
-
-  val = splitLine(line);
-  return val;
-}
-
-// Because of smaps' structure, returns sum value as uint32_t
-uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key)
-{
-  std::ifstream ifs(file);
-  assert(ifs.is_open());
-
-  std::string line;
-  uint32_t sum = 0;
-  while (std::getline(ifs, line))
-  {
-    if (line.find(key) != std::string::npos)
-    {
-      // an example by splitLine()
-      // `Pss:                   0 kB`
-      // val[0]: "Pss:", val[1]: "0" val[2]: "kB"
-      auto val = splitLine(line);
-      assert(val.size() != 0);
-      // SwapPss could show so that check where Pss is at the beginning
-      if (val[0].find("Pss") != 0)
-      {
-        continue;
-      }
-      sum += std::stoul(val[1]);
-    }
-  }
-
-  return sum;
-}
-
-} // namespace
-
 namespace benchmark
 {
 
@@ -168,7 +75,7 @@ bool MemoryPoller::end(PhaseEnum phase)
   mem = getVmRSS();
   if (_gpu_poll)
   {
-    mem += getGpuMemory();
+    mem += getGpuMemory(_process_name);
   }
   if (mem > _rss_map[phase])
     _rss_map[phase] = mem;
@@ -176,7 +83,7 @@ bool MemoryPoller::end(PhaseEnum phase)
   mem = getVmHWM();
   if (_gpu_poll)
   {
-    mem += getGpuMemory();
+    mem += getGpuMemory(_process_name);
   }
   _hwm_map[phase] = mem;
 
@@ -208,7 +115,7 @@ void MemoryPoller::process()
     uint32_t cur_hwm = getVmHWM();
     if (_gpu_poll)
     {
-      auto gpu_mem = getGpuMemory();
+      auto gpu_mem = getGpuMemory(_process_name);
       cur_rss += gpu_mem;
       cur_hwm += gpu_mem;
     }
@@ -236,77 +143,33 @@ void MemoryPoller::process()
 bool MemoryPoller::prepareMemoryPolling()
 {
   // VmRSS
+  if (!prepareVmRSS())
   {
-    std::ifstream ifs(proc_status_path);
-    if (!ifs.is_open())
-    {
-      std::cerr << "failed to open " << proc_status_path << std::endl;
-      return false;
-    }
-    ifs.close();
+    std::cerr << "failed to prepare parsing vmrss" << std::endl;
+    return false;
   }
 
   // (Additionally) GpuMemory
   if (_gpu_poll)
   {
-    std::ifstream ifs(gpu_memory_path);
-    if (!ifs.is_open())
+    if (!prepareGpuMemory())
     {
-      std::cerr << "failed to open " << gpu_memory_path << std::endl;
+      std::cerr << "failed to prepare parsing gpu memory" << std::endl;
       return false;
     }
-    ifs.close();
 
     // Needs process name
-    auto val = getValueFromFileStatus(proc_status_path, "Name");
-    assert(val.size() != 0);
-    _process_name = val[1];
+    _process_name = getProcessName();
   }
 
   // PSS
+  if (!preparePssSum())
   {
-    std::ifstream ifs(proc_smaps_path);
-    if (!ifs.is_open())
-    {
-      std::cerr << "failed to open " << proc_smaps_path << std::endl;
-      return false;
-    }
-    ifs.close();
+    std::cerr << "failed to prepare parsing pss sum" << std::endl;
+    return false;
   }
 
   return true;
 }
 
-uint32_t MemoryPoller::getVmRSS()
-{
-  auto val = getValueFromFileStatus(proc_status_path, "VmRSS");
-  if (val.size() == 0)
-    return 0;
-  assert(isStrNumber(val[1]));
-  return std::stoul(val[1]);
-}
-
-uint32_t MemoryPoller::getVmHWM()
-{
-  auto val = getValueFromFileStatus(proc_status_path, "VmHWM");
-  if (val.size() == 0)
-    return 0;
-  // key: value
-  assert(isStrNumber(val[1]));
-  return std::stoul(val[1]);
-}
-
-uint32_t MemoryPoller::getGpuMemory()
-{
-  assert(!_process_name.empty());
-  auto val = getValueFromFileStatus(gpu_memory_path, _process_name);
-  if (val.size() == 0)
-    return 0;
-  // process_name -> pid -> gpu_mem -> max_gpu_mem
-  assert(isStrNumber(val[2]));
-  return std::stoul(val[2]);
-}
-
-uint32_t MemoryPoller::getPssSum() { return getSumValueFromFileSmaps(proc_smaps_path, "Pss"); }
-
 } // namespace benchmark
diff --git a/runtime/libs/benchmark/src/Phases.cpp b/runtime/libs/benchmark/src/Phases.cpp
index 9ab67cfd9..897b943d3 100644
--- a/runtime/libs/benchmark/src/Phases.cpp
+++ b/runtime/libs/benchmark/src/Phases.cpp
@@ -17,6 +17,7 @@
 
 #include "benchmark/Phases.h"
 #include "benchmark/Types.h"
+#include "benchmark/MemoryInfo.h"
 
 #include <cassert>
 #include <chrono>
@@ -46,8 +47,11 @@ void SleepForMicros(uint64_t micros)
 namespace benchmark
 {
 
-Phases::Phases(const PhaseOption &option) : _option(option)
+Phases::Phases(const PhaseOption &option) : _option(option), _mem_before_init(0), _mem_after_run(0)
 {
+  assert(prepareVmRSS());
+  _mem_before_init = getVmHWM();
+
   if (_option.memory)
   {
     _mem_poll = std::make_unique<MemoryPoller>(std::chrono::milliseconds(option.memory_interval),
@@ -93,6 +97,8 @@ void Phases::run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc
     }
   }
 
+  _mem_after_run = getVmHWM();
+
   if (p == PhaseEnum::END_OF_PHASE)
   {
     return;
diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp
index df573da92..e6cafb91c 100644
--- a/runtime/libs/benchmark/src/Result.cpp
+++ b/runtime/libs/benchmark/src/Result.cpp
@@ -141,6 +141,15 @@ void printResultMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE]
   }
 }
 
+void printUsedPeakMemory(uint32_t init_memory, uint32_t peak_memory)
+{
+  uint32_t used_peak_memory = peak_memory - init_memory;
+  std::cout << "Used Peak Memory : " << used_peak_memory << " kb" << std::endl;
+  std::cout << "- HWM after run  : " << peak_memory << " kb" << std::endl;
+  std::cout << "- HWM before init: " << init_memory << " kb" << std::endl;
+  std::cout << "===================================" << std::endl;
+}
+
 } // namespace
 
 namespace benchmark
@@ -175,6 +184,8 @@ Result::Result(const Phases &phases)
       }
     }
   }
+  init_memory = phases.mem_before_init();
+  peak_memory = phases.mem_after_run();
 }
 
 void printResult(const Result &result)
@@ -185,6 +196,7 @@ void printResult(const Result &result)
     return;
 
   printResultMemory(result.memory);
+  printUsedPeakMemory(result.init_memory, result.peak_memory);
 }
 
 // TODO There are necessary for a kind of output data file so that it doesn't have to be csv file
diff --git a/runtime/libs/misc/include/misc/polymorphic_downcast.h b/runtime/libs/misc/include/misc/polymorphic_downcast.h
index 412b864e6..ee885eb70 100644
--- a/runtime/libs/misc/include/misc/polymorphic_downcast.h
+++ b/runtime/libs/misc/include/misc/polymorphic_downcast.h
@@ -27,9 +27,7 @@ namespace misc
 
 template <typename DstType, typename SrcType> inline DstType polymorphic_downcast(SrcType *x)
 {
-#ifndef __ANDROID__
   assert(dynamic_cast<DstType>(x) == x);
-#endif
   return static_cast<DstType>(x);
 }
 
diff --git a/runtime/libs/ndarray/CMakeLists.txt b/runtime/libs/ndarray/CMakeLists.txt
deleted file mode 100644
index b040f5115..000000000
--- a/runtime/libs/ndarray/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-add_library(ndarray STATIC src/Array.cpp src/ContiguousSpan.cpp)
-
-set_target_properties(ndarray PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-target_include_directories(ndarray PUBLIC include)
-#can't make this private because of c++ templates
-target_include_directories(ndarray PUBLIC src)
-
-option(NDARRAY_INLINE_TEMPLATES "Set to ON to disable extern declarations for common types")
-
-if(${NDARRAY_INLINE_TEMPLATES})
-    target_compile_definitions(ndarray PUBLIC -DNDARRAY_INLINE_TEMPLATES=1)
-endif()
-
-target_link_libraries(ndarray PRIVATE nnfw_common)
-target_link_libraries(ndarray PRIVATE nnfw_coverage)
-
-add_subdirectory(test)
-add_subdirectory(example)
diff --git a/runtime/libs/ndarray/example/CMakeLists.txt b/runtime/libs/ndarray/example/CMakeLists.txt
deleted file mode 100644
index c4b575dad..000000000
--- a/runtime/libs/ndarray/example/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_executable(example_no_array example_no_array.cpp)
-
-add_executable(example_array example_array.cpp)
-target_link_libraries(example_array PRIVATE ndarray)
diff --git a/runtime/libs/ndarray/example/example_array.cpp b/runtime/libs/ndarray/example/example_array.cpp
deleted file mode 100644
index 85d274681..000000000
--- a/runtime/libs/ndarray/example/example_array.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ndarray/Array.h"
-
-#include <iostream>
-#include <iterator>
-
-using namespace ndarray;
-
-void gather_array(const Array<float> &input, Array<float> &output, const Array<int> &indices)
-{
-  assert(indices.shape().rank() == 3);
-  assert(input.shape().rank() == 3);
-  assert(indices.shape().dim(1) == input.shape().rank());
-
-  for (size_t i = 0; i < indices.shape().dim(0); ++i)
-  {
-    for (size_t j = 0; j < indices.shape().dim(1); ++j)
-    {
-      auto index = indices.slice(i, j);
-      output.slice(i, j).assign(input.slice(index[0], index[1]));
-    }
-  }
-}
-
-int main()
-{
-  // fill tensor of shape[3,3,4] with sequential numbers from [0..36)
-  Shape in_shape{3, 3, 4};
-  std::vector<float> input_data(in_shape.element_count());
-  for (size_t i = 0; i < in_shape.element_count(); ++i)
-    input_data[i] = i;
-
-  Array<float> input(input_data.data(), in_shape);
-
-  // select column-vectors on main diagonal
-  Shape indices_shape{1, 3, 2};
-  std::vector<int> indices_data(indices_shape.element_count());
-  Array<int> indices(indices_data.data(), indices_shape);
-
-  indices.slice(0, 0) = {0, 0};
-  indices.slice(0, 1) = {1, 1};
-  indices.slice(0, 2) = {2, 2};
-
-  Shape output_shape{1, 3, 4};
-  std::vector<float> output_data(output_shape.element_count());
-
-  Array<float> output(output_data.data(), output_shape);
-
-  gather_array(input, output, indices);
-
-  for (size_t i = 0; i < indices_shape.dim(0); ++i)
-  {
-    for (size_t j = 0; j < indices_shape.dim(1); ++j)
-    {
-      auto output_piece = output.slice(i, j);
-      std::ostream_iterator<int> cout_it(std::cout, ", ");
-      std::copy(output_piece.begin(), output_piece.end(), cout_it);
-      std::cout << std::endl;
-    }
-  }
-}
diff --git a/runtime/libs/ndarray/example/example_no_array.cpp b/runtime/libs/ndarray/example/example_no_array.cpp
deleted file mode 100644
index 3a4d05dca..000000000
--- a/runtime/libs/ndarray/example/example_no_array.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <array>
-#include <vector>
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-
-void gather_no_array(const float *in_data, const std::array<size_t, 3> &dims, float *out_data,
-                     const std::array<size_t, 3> &out_dims, //[nselections,
-                     const int *indices, const std::array<size_t, 3> &indices_dims)
-{
-  assert(indices_dims[1] == dims.size());
-
-  for (int i = 0; i < indices_dims[0]; ++i)
-  {
-    for (int j = 0; j < indices_dims[1]; ++j)
-    {
-      const int *index_ptr = indices + i * indices_dims[2] * indices_dims[1] + j * indices_dims[2];
-
-      size_t in_offset = index_ptr[0] * dims[2] * dims[1] + index_ptr[1] * dims[2];
-
-      const float *in_ptr = in_data + in_offset;
-
-      size_t out_offset = i * out_dims[2] * out_dims[1] + j * out_dims[2];
-
-      float *out_ptr = out_data + out_offset;
-
-      for (int k = 0; k < dims[2]; ++k)
-      {
-        out_ptr[k] = in_ptr[k];
-      }
-    }
-  }
-}
-
-int main()
-{
-  std::array<size_t, 3> in_dims{3, 3, 4};
-  std::vector<float> input(3 * 3 * 4);
-  for (size_t i = 0; i < 3 * 3 * 4; ++i)
-    input[i] = i;
-
-  std::array<size_t, 3> indices_shape{1, 3, 2};
-  std::vector<int> indices(1 * 3 * 2);
-
-  indices[0] = 0;
-  indices[1] = 0;
-  indices[2] = 1;
-  indices[3] = 1;
-  indices[4] = 2;
-  indices[5] = 2;
-
-  std::array<size_t, 3> output_dims{1, 3, 4};
-  std::vector<float> output(1 * 3 * 4);
-
-  gather_no_array(input.data(), in_dims, output.data(), output_dims, indices.data(), indices_shape);
-
-  for (size_t i = 0; i < output_dims[0]; ++i)
-  {
-    for (size_t j = 0; j < output_dims[1]; ++j)
-    {
-      auto out_ptr = output.data() + i * output_dims[1] * output_dims[2] + j * output_dims[2];
-      for (size_t k = 0; k < output_dims[2]; ++k)
-      {
-        std::cout << out_ptr[k] << ", ";
-      }
-      std::cout << std::endl;
-    }
-  }
-}
diff --git a/runtime/libs/ndarray/include/ndarray/Array.h b/runtime/libs/ndarray/include/ndarray/Array.h
deleted file mode 100644
index 3890cc26b..000000000
--- a/runtime/libs/ndarray/include/ndarray/Array.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _NDARRAY_ARRAY_H_
-#define _NDARRAY_ARRAY_H_
-
-#include "Common.h"
-
-#include "ContiguousSpan.h"
-#include "Shape.h"
-
-#if __cplusplus < 201402L
-#include "detail/cxx14.h" //integer_sequence and make_index_dequence definitions
-#else
-#include <utility>
-#endif
-
-#include <algorithm>
-#include <cassert>
-#include <type_traits>
-#include <array>
-#include <tuple>
-#include <cstddef>
-
-namespace ndarray
-{
-
-// there is no index_sequence before c++14
-#if __cplusplus < 201402L
-
-template <size_t... Nums> using index_sequence = cxx14::index_sequence<Nums...>;
-
-template <size_t Num> using make_index_sequence = cxx14::make_index_sequence<Num>;
-
-#else
-
-template <size_t... Nums> using index_sequence = std::index_sequence<Nums...>;
-
-template <size_t _Num> using make_index_sequence = std::make_index_sequence<_Num>;
-
-#endif //__cplusplus < 201402L
-
-struct Strides
-{
-  explicit Strides(Shape s) : _strides{} { fillStrides(s); }
-
-  int operator[](size_t idx) const noexcept { return _strides[idx]; }
-
-  // since we don't have c++14 fold expression
-  template <typename Seq, typename... Ts> struct _calc_offset;
-
-  template <size_t Num, size_t... Nums, typename T, typename... Ts>
-  struct _calc_offset<index_sequence<Num, Nums...>, T, Ts...>
-  {
-    static constexpr size_t get(const std::array<int, 8> &strides, int x, Ts... xs)
-    {
-      return _calc_offset<index_sequence<Nums...>, Ts...>::get(strides, xs...) +
-             x * std::get<Num>(strides);
-    }
-  };
-
-  template <size_t Num, typename T> struct _calc_offset<index_sequence<Num>, T>
-  {
-    static constexpr size_t get(const std::array<int, 8> &strides, int x)
-    {
-      return x * std::get<Num>(strides);
-    }
-  };
-
-  template <typename Seq, typename... Ts> constexpr size_t offset(Seq, Ts... x) const noexcept
-  {
-    // return ( 0 + ... + (std::get<Nums>(_strides) * x)); in c++14
-    return _calc_offset<Seq, Ts...>::get(_strides, x...);
-  }
-
-private:
-  void fillStrides(const Shape &s) noexcept
-  {
-    int rank = s.rank();
-    _strides[rank - 1] = 1;
-    for (int d = rank - 2; d >= 0; --d)
-    {
-      _strides[d] = _strides[d + 1] * s.dim(d + 1);
-    }
-  }
-
-  std::array<int, NDARRAY_MAX_DIMENSION_COUNT> _strides;
-};
-
-template <typename T> class Array
-{
-public:
-  Array(T *data, Shape shape) noexcept : _data(data), _shape(shape), _strides(shape) {}
-
-  Array(const Array &) = delete;
-
-  Array(Array &&a) noexcept : _data(a._data), _shape(a._shape), _strides(a._strides)
-  {
-    a._data = nullptr;
-  }
-
-  template <typename... Ts> T &at(Ts... x) const noexcept { return _at(static_cast<size_t>(x)...); }
-
-  /**
-   * @brief returns last dimension as ContigniousSpan
-   * @param x indices of slice to take. See tests for usage details
-   * @return slice at given position
-   */
-  template <typename... Ts> ContiguousSpan<T, std::is_const<T>::value> slice(Ts... x) noexcept
-  {
-    assert(sizeof...(Ts) == _shape.rank() - 1);
-    return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)};
-  }
-
-  /**
-   * @brief returns last dimension as ContigniousSpan
-   * @param x indices of slice to take. See tests for usage details
-   * @return slice at given position
-   */
-  template <typename... Ts> ContiguousSpan<T, true> slice(Ts... x) const noexcept
-  {
-    assert(sizeof...(Ts) == _shape.rank() - 1);
-    return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)};
-  }
-
-  ContiguousSpan<T, std::is_const<T>::value> flat() noexcept
-  {
-    return {_data, _shape.element_count()};
-  }
-
-  ContiguousSpan<T, true> flat() const noexcept { return {_data, _shape.element_count()}; }
-
-  const Shape &shape() const noexcept { return _shape; }
-
-private:
-  template <typename... Ts> T &_at(Ts... x) const noexcept
-  {
-    assert(sizeof...(x) == _shape.rank());
-    using Indices = make_index_sequence<sizeof...(Ts)>;
-    return _data[offset(Indices{}, x...)];
-  }
-
-  template <typename... Ts, size_t... Nums>
-  size_t offset(index_sequence<Nums...> seq, Ts... x) const noexcept
-  {
-    static_assert(
-        sizeof...(Ts) == sizeof...(Nums),
-        "Sanity check failed. Generated index sequence size is not equal to argument count");
-
-    return _strides.offset(seq, x...);
-  }
-
-  T *_data;
-  Shape _shape;
-  Strides _strides;
-};
-
-template <typename To, typename From> Array<To> array_cast(Array<From> &&from, Shape newShape)
-{
-  assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count());
-  return Array<To>(reinterpret_cast<To *>(from.flat().data()), newShape);
-}
-
-template <typename To, typename From>
-Array<const To> array_cast(const Array<From> &from, Shape newShape)
-{
-  assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count());
-  return Array<To>(reinterpret_cast<const To *>(from.flat().data()), newShape);
-}
-
-#ifndef NDARRAY_INLINE_TEMPLATES
-
-extern template class Array<float>;
-extern template class Array<int32_t>;
-extern template class Array<uint32_t>;
-extern template class Array<uint8_t>;
-
-#endif // NDARRAY_INLINE_TEMPLATES
-
-} // namespace ndarray
-
-#endif //_NDARRAY_ARRAY_H_
diff --git a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h b/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h
deleted file mode 100644
index 8caa6a686..000000000
--- a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _NDARRAY_CONTIGNIOUS_SPAN_H_
-#define _NDARRAY_CONTIGNIOUS_SPAN_H_
-
-#include <type_traits>
-#include <vector>
-#include <cstdint>
-#include <cstddef>
-#include <cassert>
-
-namespace ndarray
-{
-
-template <typename T, bool isConst = false> class ContiguousSpan
-{
-public:
-  using pointer_type = typename std::conditional<isConst, const T *, T *>::type;
-  using reference_type = typename std::conditional<isConst, const T &, T &>::type;
-  using iterator_type = pointer_type;
-
-  ContiguousSpan(pointer_type data, size_t len) noexcept : _data(data), _len(len) {}
-
-  template <typename It>
-  explicit ContiguousSpan(It first, It last) noexcept
-      : _data(&*first), _len(std::distance(first, last))
-  {
-  }
-
-  ContiguousSpan(const ContiguousSpan &) = delete;
-
-  ContiguousSpan(ContiguousSpan &&s) noexcept : _data(s._data), _len(s._len) { s._data = nullptr; }
-
-  operator ContiguousSpan<T, true>() { return ContiguousSpan<T, true>{_data, _len}; }
-
-  reference_type operator[](size_t idx) const noexcept { return _data[idx]; }
-
-  reference_type at(size_t idx) const noexcept { return _data[idx]; }
-
-  ContiguousSpan<T, isConst> offset(size_t offset)
-  {
-    assert(offset <= _len);
-    return {_data + offset, _len - offset};
-  }
-
-  template <typename From, bool _ = isConst>
-  typename std::enable_if<!_, void>::type assign(const From &f) noexcept
-  {
-    assignFrom(std::begin(f), std::end(f));
-  }
-
-  template <typename U, bool _ = isConst>
-  typename std::enable_if<!_, ContiguousSpan &>::type
-  operator=(std::initializer_list<U> list) noexcept
-  {
-    assignFrom(std::begin(list), std::end(list));
-    return *this;
-  }
-
-  template <typename It, bool _ = isConst>
-  typename std::enable_if<!_, void>::type assignFrom(It first, It last) noexcept
-  {
-    std::copy(first, last, begin());
-  }
-
-  size_t size() const { return _len; }
-
-  iterator_type begin() const { return iterator_type{_data}; }
-
-  iterator_type end() const { return iterator_type{_data + _len}; }
-
-  pointer_type data() { return _data; }
-
-private:
-  pointer_type _data;
-  size_t _len;
-};
-
-#ifndef NDARRAY_INLINE_TEMPLATES
-
-extern template class ContiguousSpan<float, true>;
-extern template class ContiguousSpan<float, false>;
-extern template class ContiguousSpan<int32_t, true>;
-extern template class ContiguousSpan<int32_t, false>;
-extern template class ContiguousSpan<uint32_t, true>;
-extern template class ContiguousSpan<uint32_t, false>;
-extern template class ContiguousSpan<uint8_t, true>;
-extern template class ContiguousSpan<uint8_t, false>;
-
-#endif // NDARRAY_INLINE_TEMPLATES
-
-} // namespace ndarray
-
-#endif //_NDARRAY_CONTIGNIOUS_SPAN_H_
diff --git a/runtime/libs/ndarray/include/ndarray/Shape.h b/runtime/libs/ndarray/include/ndarray/Shape.h
deleted file mode 100644
index fa58613b8..000000000
--- a/runtime/libs/ndarray/include/ndarray/Shape.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _NDARRAY_SHAPE_H_
-#define _NDARRAY_SHAPE_H_
-
-#include "Common.h"
-
-#include <array>
-#include <cassert>
-#include <cstddef>
-
-namespace ndarray
-{
-
-class Shape
-{
-public:
-  //_dims{} here and later since array does not have std::initializer_list ctor
-  // and aggregate initialization is not allowed here
-  explicit Shape(size_t rank) noexcept : _dims{}, _rank(rank)
-  {
-    std::fill(_dims.begin(), _dims.end(), 0);
-  }
-
-  Shape(std::initializer_list<size_t> list) noexcept : _dims{}, _rank(list.size())
-  {
-    std::copy(list.begin(), list.end(), _dims.begin());
-  }
-
-  size_t dim(int i) const noexcept { return _dims.at(i); }
-
-  size_t &dim(int i) noexcept { return _dims.at(i); }
-
-  size_t element_count() const noexcept
-  {
-    uint32_t res = 1;
-    for (size_t i = 0; i < rank(); ++i)
-      res *= dim(i);
-    assert(res <= 0xffffffff);
-    return res;
-  }
-
-  size_t rank() const noexcept { return _rank; }
-
-private:
-  std::array<size_t, NDARRAY_MAX_DIMENSION_COUNT> _dims;
-  size_t _rank;
-};
-
-} // namespace ndarray
-
-#endif //_NDARRAY_SHAPE_H_
diff --git a/runtime/libs/ndarray/src/detail/cxx14.h b/runtime/libs/ndarray/src/detail/cxx14.h
deleted file mode 100644
index 81135b3f2..000000000
--- a/runtime/libs/ndarray/src/detail/cxx14.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _NDARRAY_CXX14_H_
-#define _NDARRAY_CXX14_H_
-
-namespace ndarray
-{
-
-namespace cxx14
-{
-
-template <size_t... Nums> struct index_sequence
-{
-  using value_type = size_t;
-
-  static constexpr std::size_t size() noexcept { return sizeof...(Nums); }
-};
-
-namespace detail
-{
-
-template <size_t v, typename Seq> struct _append;
-
-template <size_t v, size_t... Nums> struct _append<v, index_sequence<Nums...>>
-{
-  using result = index_sequence<Nums..., v>;
-};
-
-template <size_t Len> struct make_index_sequence
-{
-  using result =
-      typename detail::_append<Len - 1, typename make_index_sequence<Len - 1>::result>::result;
-};
-
-template <> struct make_index_sequence<1>
-{
-  using result = index_sequence<0>;
-};
-
-template <> struct make_index_sequence<0>
-{
-  using result = index_sequence<>;
-};
-
-} // namespace detail
-
-template <size_t Num> using make_index_sequence = typename detail::make_index_sequence<Num>::result;
-
-} // namespace cxx14
-
-} // namespace ndarray
-
-#endif //_NDARRAY_CXX14_H_
diff --git a/runtime/libs/ndarray/test/CMakeLists.txt b/runtime/libs/ndarray/test/CMakeLists.txt
deleted file mode 100644
index 16f8779ee..000000000
--- a/runtime/libs/ndarray/test/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-if(NOT BUILD_NDARRAY_TEST)
-    return()
-endif()
-
-add_executable(ndarray_test ndarray_test.cpp)
-
-target_link_libraries(ndarray_test PRIVATE ndarray)
-
-nnfw_find_package(GTest)
-if(NOT GTest_FOUND)
-    message(STATUS "GTest not avaialble. Skipping NDArray test build")
-    return()
-endif(NOT GTest_FOUND)
-
-target_link_libraries(ndarray_test PUBLIC gtest gtest_main ${LIB_PTHREAD})
-
-add_test(ndarray_test ndarray_test)
diff --git a/runtime/libs/ndarray/test/ndarray_test.cpp b/runtime/libs/ndarray/test/ndarray_test.cpp
deleted file mode 100644
index 0aa948c72..000000000
--- a/runtime/libs/ndarray/test/ndarray_test.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "gtest/gtest.h"
-
-#include "ndarray/Array.h"
-
-using namespace ndarray;
-
-TEST(NDArray_tests, basic_data_test)
-{
-
-  float raw_data[] = {1, 2, 3, 4};
-
-  Array<float> data22{raw_data, {2, 2}};
-
-  ASSERT_FLOAT_EQ(data22.at(0, 0), 1);
-  ASSERT_FLOAT_EQ(data22.at(0, 1), 2);
-  ASSERT_FLOAT_EQ(data22.at(1, 0), 3);
-  ASSERT_FLOAT_EQ(data22.at(1, 1), 4);
-
-  Array<float> data14{raw_data, {1, 4}};
-  ASSERT_FLOAT_EQ(data22.at(0, 0), 1);
-  ASSERT_FLOAT_EQ(data22.at(0, 1), 2);
-  ASSERT_FLOAT_EQ(data22.at(0, 2), 3);
-  ASSERT_FLOAT_EQ(data22.at(0, 3), 4);
-}
-
-TEST(NDArray_tests, slice_write_test)
-{
-  float raw_data[4] = {0};
-
-  Array<float> data22{raw_data, {2, 2}};
-
-  data22.slice(1) = {1, 2};
-
-  ASSERT_FLOAT_EQ(data22.at(0, 0), 0);
-  ASSERT_FLOAT_EQ(data22.at(0, 1), 0);
-  ASSERT_FLOAT_EQ(data22.at(1, 0), 1);
-  ASSERT_FLOAT_EQ(data22.at(1, 1), 2);
-}
-
-TEST(NDArray_tests, slice_read_test)
-{
-  float raw_data[4] = {1, 2, 3, 4};
-
-  Array<float> data22{raw_data, {2, 2}};
-
-  auto slice = data22.slice(1);
-
-  ASSERT_FLOAT_EQ(slice[0], 3);
-  ASSERT_FLOAT_EQ(slice[1], 4);
-}
-
-TEST(NDArray_tests, multidim_test)
-{
-  float raw_data[5] = {0, 1, 2, 3, 4};
-
-  Array<float> data22{raw_data, {1, 1, 1, 1, 5}};
-
-  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 0), 0);
-  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 1), 1);
-  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 2), 2);
-  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 3), 3);
-  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 4), 4);
-}
-
-TEST(NDArray_tests, slice_assign_test)
-{
-  std::vector<float> v1{1, 2, 3, 4, 5};
-  std::vector<float> v2(5);
-
-  ContiguousSpan<float> span1(v1.begin(), v1.end());
-  ContiguousSpan<float> span2(v2.begin(), v2.end());
-
-  span2.assign(span1);
-
-  ASSERT_EQ(v1, v2);
-}
diff --git a/runtime/libs/nnapi/CMakeLists.txt b/runtime/libs/nnapi/CMakeLists.txt
index a5d9490d1..73f82b909 100644
--- a/runtime/libs/nnapi/CMakeLists.txt
+++ b/runtime/libs/nnapi/CMakeLists.txt
@@ -1,3 +1,4 @@
-add_subdirectories()
+add_library(nnfw_lib_nnapi INTERFACE)
 
-add_library(nnfw_lib_nnapi ALIAS nnfw_lib_nnapi_1_2)
+target_include_directories(nnfw_lib_nnapi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_link_libraries(nnfw_lib_nnapi INTERFACE nnfw-nnapi-header)
diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/include/NeuralNetworksExShim.h
index 855613241..855613241 100644
--- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h
+++ b/runtime/libs/nnapi/include/NeuralNetworksExShim.h
diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h
index 1c482b54c..1c482b54c 100644
--- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h
+++ b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h
diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h b/runtime/libs/nnapi/include/NeuralNetworksShim.h
index 80082383f..80082383f 100644
--- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h
+++ b/runtime/libs/nnapi/include/NeuralNetworksShim.h
diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h b/runtime/libs/nnapi/include/NeuralNetworksTypes.h
index d74402749..d74402749 100644
--- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h
+++ b/runtime/libs/nnapi/include/NeuralNetworksTypes.h
diff --git a/runtime/libs/nnapi/v1.1/CMakeLists.txt b/runtime/libs/nnapi/v1.1/CMakeLists.txt
deleted file mode 100644
index dc018c60f..000000000
--- a/runtime/libs/nnapi/v1.1/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_library(nnfw_lib_nnapi_1_1 INTERFACE)
-
-target_include_directories(nnfw_lib_nnapi_1_1 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_link_libraries(nnfw_lib_nnapi_1_1 INTERFACE nnfw-nnapi-header)
diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h
deleted file mode 100644
index f684dab90..000000000
--- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
-   Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * @file     NeuralNetworksExShim.h
- * @brief    This file contains an actual implementation of
- *           ANeuralNetworksModel_addOperationEx function
- * @ingroup  COM_AI_RUNTIME
- */
-
-#ifndef NN_API_EX_SHIM_H
-#define NN_API_EX_SHIM_H
-
-#include "NeuralNetworksEx.h"
-#include "NeuralNetworksLoadHelpers.h"
-
-typedef int (*ANeuralNetworksModel_addOperationEx_fn)(ANeuralNetworksModel *model,
-                                                      ANeuralNetworksOperationTypeEx type,
-                                                      uint32_t inputCount, const uint32_t *inputs,
-                                                      uint32_t outputCount,
-                                                      const uint32_t *outputs);
-
-/**
- * @brief Add an extended operation to a model.
- *
- * @param[in] model The model to be modified.
- * @param[in] type The type of extended operation.
- * @param[in] inputCount The number of entries in the inputs array.
- * @param[in] inputs An array of indexes identifying each operand.
- * @param[in] outputCount The number of entries in the outputs array.
- * @param[in] outputs An array of indexes identifying each operand.
- *
- * @note The operands specified by inputs and outputs must have been
- *       previously added by calls to {@link ANeuralNetworksModel_addOperand}.\n
- *       Attempting to modify a model once {@link ANeuralNetworksModel_finish}
- *       has been called will return an error.\n
- *       See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-
-inline int ANeuralNetworksModel_addOperationEx(ANeuralNetworksModel *model,
-                                               ANeuralNetworksOperationTypeEx type,
-                                               uint32_t inputCount, const uint32_t *inputs,
-                                               uint32_t outputCount, const uint32_t *outputs)
-{
-  LOAD_FUNCTION(ANeuralNetworksModel_addOperationEx);
-  EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount, outputs);
-}
-
-#endif // NN_API_EX_SHIM_H
diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h
deleted file mode 100644
index 201465f9c..000000000
--- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
-   Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// NOTE To minimize diff with upstream tensorflow, disable clang-format
-// clang-format off
-
-// NOTE This header is derived from part of the following file (in TensorFlow v1.12)
-//       'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h'
-
-/**
- * @file NeuralNetworksLoadHelpers.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains functions to load NN API runtime library
- */
-
-#ifndef __NEURAL_NETWORKS_LOAD_HELPER_H__
-#define __NEURAL_NETWORKS_LOAD_HELPER_H__
-
-#include <dlfcn.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/**
- * @brief Print log data
- * @param[in] format    Format string of @c printf
- * @param[in] args      Argument after format string. (Same with @c printf)
- */
-#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__);
-
-/**
- * @brief Create a function pointer named @c fn after loading NN API library
- * @param[in] name    Name of a function
- */
-#define LOAD_FUNCTION(name) \
-  static name##_fn fn = reinterpret_cast<name##_fn>(nnfw::loadFunction(#name));
-
-/**
- * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION
- * @param[in] args    List of arguments for the function @c fn
- */
-#define EXECUTE_FUNCTION(...) \
-  if (fn != nullptr) {        \
-    fn(__VA_ARGS__);          \
-  }
-
-/**
- * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION
- * @param[in] args    List of arguments for the function @c fn
- * @return            the return value of @c fn
- */
-#define EXECUTE_FUNCTION_RETURN(...) return fn != nullptr ? fn(__VA_ARGS__) : 0;
-
-namespace nnfw
-{
-
-/**
- * @brief Load NN API library
- * @param[in] name path of NN API library
- * @return a symbol table handle of NN API library
- */
-inline void* loadLibrary(const char* name) {
-  // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
-  // api RT
-  void* handle = nullptr;
-#if 1 //#ifdef __ANDROID__
-  handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
-  if (handle == nullptr) {
-    NNAPI_LOG("nnapi error: unable to open library %s", name);
-    NNAPI_LOG("             %s", dlerror());
-  }
-#endif
-  return handle;
-}
-
-/**
- * @brief Load libneuralnetworks.so and return handle of library
- * @return a symbol table handle of NN API library
- */
-inline void* getLibraryHandle() {
-  static void* handle = loadLibrary("libneuralnetworks.so");
-  return handle;
-}
-
-/**
- * @brief Return function ptr in libneuralnetworks.so
- * @param[in] name    Name of function
- * @return function pointer
- */
-inline void* loadFunction(const char* name) {
-  void* fn = nullptr;
-  if (getLibraryHandle() != nullptr) {
-    fn = dlsym(getLibraryHandle(), name);
-  }
-  if (fn == nullptr) {
-    NNAPI_LOG("nnapi error: unable to open function %s", name);
-    NNAPI_LOG("             %s", dlerror());
-    abort();
-  }
-  else {
-#ifdef _GNU_SOURCE
-    Dl_info info;
-    if (dladdr(fn, &info))
-    {
-      NNAPI_LOG("nnapi function '%s' is loaded from '%s' ", name, info.dli_fname);
-    }
-    else
-    {
-      NNAPI_LOG("nnapi function '%s' is failed to load", name);
-    }
-
-#endif // _GNU_SOURCE
-  }
-  return fn;
-}
-
-/**
- * @brief Check if libneuralnetworks.so can be loaded
- * @return @c true if loading is successful, otherwise @c false.
- */
-inline bool NNAPIExists() {
-  static bool nnapi_is_available = getLibraryHandle();
-  return nnapi_is_available;
-}
-
-} // namespace nnfw
-
-#endif // __NEURAL_NETWORKS_LOAD_HELPER_H__
diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h
deleted file mode 100644
index 60b16f766..000000000
--- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h
+++ /dev/null
@@ -1,709 +0,0 @@
-/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
-   Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// NOTE To minimize diff with upstream tensorflow, disable clang-format
-// clang-format off
-
-// NOTE This header is derived from part of the following file (in TensorFlow v1.12)
-//       'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h'
-#ifndef __NEURAL_NETWORKS_SHIM__
-#define __NEURAL_NETWORKS_SHIM__
-
-#include "NeuralNetworks.h"
-#include "NeuralNetworksLoadHelpers.h"
-
-// nn api function types
-
-typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
-    size_t size, int protect, int fd, size_t offset,
-    ANeuralNetworksMemory** memory);
-
-typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
-
-typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
-
-typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
-
-typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
-
-typedef int (*ANeuralNetworksCompilation_create_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
-
-typedef void (*ANeuralNetworksCompilation_free_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
-    ANeuralNetworksCompilation* compilation, int32_t preference);
-
-typedef int (*ANeuralNetworksCompilation_finish_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksModel_addOperand_fn)(
-    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
-
-typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
-    ANeuralNetworksModel* model, int32_t index, const void* buffer,
-    size_t length);
-
-typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
-    ANeuralNetworksModel* model, int32_t index,
-    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksModel_addOperation_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
-    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
-    const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
-    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
-    uint32_t outputCount, const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
-    ANeuralNetworksModel* model, bool allow);
-
-typedef int (*ANeuralNetworksExecution_create_fn)(
-    ANeuralNetworksCompilation* compilation,
-    ANeuralNetworksExecution** execution);
-
-typedef void (*ANeuralNetworksExecution_free_fn)(
-    ANeuralNetworksExecution* execution);
-
-typedef int (*ANeuralNetworksExecution_setInput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_startCompute_fn)(
-    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
-
-typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
-
-typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
-
-/**
- * Creates a shared memory object from a file descriptor.
- *
- * The shared memory is backed by a file descriptor via mmap.
- * See {@link ANeuralNetworksMemory} for a description on how to use
- * this shared memory.
- *
- * @param size The requested size in bytes.
- *             Must not be larger than the file size.
- * @param prot The desired memory protection for the mapping.
- *             It is either PROT_NONE or the bitwise OR of one or
- *             more of the following flags: PROT_READ, PROT_WRITE.
- * @param fd The requested file descriptor.
- *           The file descriptor has to be mmap-able. The file
- *           descriptor will be duplicated.
- * @param offset The offset to the beginning of the file of the area to map.
- *               The offset has to be aligned to a page size.
- * @param memory The memory object to be created.
- *               Set to NULL if unsuccessful.
- *
- * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
- */
-inline int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd,
-                                              size_t offset,
-                                              ANeuralNetworksMemory** memory) {
-  LOAD_FUNCTION(ANeuralNetworksMemory_createFromFd);
-  EXECUTE_FUNCTION_RETURN(size, protect, fd, offset, memory);
-}
-
-/**
- * Delete a memory object.
- *
- * Destroys the object used by the run time to keep track of the memory.
- * This will free the underlying actual memory if no other code has open
- * handles to this memory.
- *
- * @param memory The memory object to be freed.
- */
-inline void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) {
-  LOAD_FUNCTION(ANeuralNetworksMemory_free);
-  EXECUTE_FUNCTION(memory);
-}
-
-/**
- * Create an empty {@link ANeuralNetworksModel}.
- *
- * <p>This only creates the object. Computation is performed once
- * {@link ANeuralNetworksExecution_startCompute} is invoked.
- *
- * The model should be constructed with calls to
- * {@link ANeuralNetworksModel_addOperation} and
- * {@link ANeuralNetworksModel_addOperand}
- *
- * <p>{@link ANeuralNetworksModel_finish} should be called once the model
- * has been fully constructed.</p>
- *
- * <p>{@link ANeuralNetworksModel_free} should be called once the model
- * is no longer needed.</p>
- *
- * @param model The {@link ANeuralNetworksModel} to be created.
- *              Set to NULL if unsuccessful.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_create(ANeuralNetworksModel** model) {
-  LOAD_FUNCTION(ANeuralNetworksModel_create);
-  EXECUTE_FUNCTION_RETURN(model);
-}
-
-/**
- * Destroy a model.
- *
- * The model need not have been finished by a call to
- * {@link ANeuralNetworksModel_finish}.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @param model The model to be destroyed. Passing NULL is acceptable and
- *              results in no operation.
- */
-inline void ANeuralNetworksModel_free(ANeuralNetworksModel* model) {
-  LOAD_FUNCTION(ANeuralNetworksModel_free);
-  EXECUTE_FUNCTION(model);
-}
-
-/**
- * Indicate that we have finished modifying a model. Required before
- * calling {@link ANeuralNetworksCompilation_compile}.
- *
- * An application is responsible to make sure that no other thread uses
- * the model at the same time.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @param model The model to be finished.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
-  LOAD_FUNCTION(ANeuralNetworksModel_finish);
-  EXECUTE_FUNCTION_RETURN(model);
-}
-
-/**
- * Add an operand to a model.
- *
- * The order in which the operands are added is important. The first one added
- * to a model will have the index value 0, the second 1, etc. These indexes are
- * used as operand identifiers in {@link ANeuralNetworksModel_addOperation},
- * {@link ANeuralNetworksExecution_setInput},
- * {@link ANeuralNetworksExecution_setInputFromMemory},
- * {@link ANeuralNetworksExecution_setOutput},
- * {@link ANeuralNetworksExecution_setOutputFromMemory} and
- * {@link ANeuralNetworksExecution_setOperandValue}.
- *
- * To build a model that can accommodate inputs of various sizes, as you may
- * want to do for a CNN, set the size of the dimensions that will vary at run
- * time to 0. If you do so, provide the full dimensions when calling
- * {@link ANeuralNetworksExecution_setInput} or {@link
- * ANeuralNetworksExecution_setInputFromMemory}.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @param model The model to be modified.
- * @param type The {@link ANeuralNetworksOperandType} that describes the shape
- * of the operand.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_addOperand(
-    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) {
-  LOAD_FUNCTION(ANeuralNetworksModel_addOperand);
-  EXECUTE_FUNCTION_RETURN(model, type);
-}
-
-/**
- * Sets an operand to a constant value.
- *
- * For scalar values, the content of buffer is copied into the model.
- *
- * For tensor values, a pointer to the buffer is stored within the model.
- * The application is responsible for not changing the content of this region
- * until all executions using this model have completed. As the data may
- * be copied during processing, modifying the data after this call yields
- * undefined results.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param buffer A pointer to the data to use.
- * @param length The size in bytes of the data value.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model,
-                                                int32_t index,
-                                                const void* buffer,
-                                                size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksModel_setOperandValue);
-  EXECUTE_FUNCTION_RETURN(model, index, buffer, length);
-}
-
-/**
- * Sets an operand to a value stored in a memory object.
- *
- * The content of the memory is not copied. A reference to that memory is stored
- * inside the model. The application is responsible for not changing the content
- * of the memory region until all executions using this model have completed.
- * As the data may be copied during processing, modifying the data after this
- * call yields undefined results.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param buffer A pointer to the data to use.
- * @param memory The memory containing the data.
- * @param offset This specifies the location of the data within the memory.
- *               The offset is in bytes from the start of memory.
- * @param length The size in bytes of the data value.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_setOperandValueFromMemory(
-    ANeuralNetworksModel* model, int32_t index,
-    const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksModel_setOperandValueFromMemory);
-  EXECUTE_FUNCTION_RETURN(model, index, memory, offset, length);
-}
-
-/**
- * Add an operation to a model.
- *
- * @param model The model to be modified.
- * @param type The type of the operation.
- * @param inputCount The number of entries in the inputs array.
- * @param inputs An array of indexes identifying each operand.
- * @param outputCount The number of entries in the outputs array.
- * @param outputs An array of indexes identifying each operand.
- *
- * The operands specified by inputs and outputs must have been
- * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
-                                             ANeuralNetworksOperationType type,
-                                             uint32_t inputCount,
-                                             const uint32_t* inputs,
-                                             uint32_t outputCount,
-                                             const uint32_t* outputs) {
-  LOAD_FUNCTION(ANeuralNetworksModel_addOperation);
-  EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount,
-                          outputs);
-}
-
-/**
- * Specifies which operands will be the model's inputs and outputs.
- *
- * An operand cannot be used for both input and output. Doing so will
- * return an error.
- *
- * @param model The model to be modified.
- * @param inputCount The number of entries in the inputs array.
- * @param inputs An array of indexes identifying the input operands.
- * @param outputCount The number of entries in the outputs array.
- * @param outputs An array of indexes identifying the output operands.
- *
- * The operands specified by inputs and outputs must have been
- * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- *
- */
-inline int ANeuralNetworksModel_identifyInputsAndOutputs(
-    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
-    uint32_t outputCount, const uint32_t* outputs) {
-  LOAD_FUNCTION(ANeuralNetworksModel_identifyInputsAndOutputs);
-  EXECUTE_FUNCTION_RETURN(model, inputCount, inputs, outputCount, outputs);
-}
-
-/**
- * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be
- * calculated with range and/or precision as low as that of the IEEE 754 16-bit
- * floating-point format. By default, {@link ANEURALNETWORKS_TENSOR_FLOAT32}
- * must be calculated using at least the range and precision of the IEEE 754
- * 32-bit floating-point format.
- *
- * @param model The model to be modified.
- * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
- *              calculated with range and/or precision as low as that of the
- *              IEEE 754 16-bit floating point format. 'false' indicates
- *              {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using
- *              at least the range and precision of the IEEE 754 32-bit floating
- *              point format.
- *
- * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
- * been called will return an error.
- *
- * Available since API level 28.
- *
- * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- */
-inline int ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-    ANeuralNetworksModel* model, bool allow) {
-  LOAD_FUNCTION(ANeuralNetworksModel_relaxComputationFloat32toFloat16);
-  EXECUTE_FUNCTION_RETURN(model, allow);
-}
-
-/**
- * Create a {@link ANeuralNetworksCompilation} to compile the given model.
- * This only creates the object. Compilation is only performed once
- * {@link ANeuralNetworksCompilation_start} is invoked.
- *
- * <p>The provided model must outlive the compilation.</p>
- *
- * The model must already have been finished by a call to
- * {@link ANeuralNetworksModel_finish}.
- *
- * See {@link ANeuralNetworksCompilation} for information on multithreaded
- * usage.
- *
- * @param model The {@link ANeuralNetworksModel} to be compiled.
- * @param compilation The newly created object or NULL if unsuccessful.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
- *         if the model is invalid.
- */
-inline int ANeuralNetworksCompilation_create(
-    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
-  LOAD_FUNCTION(ANeuralNetworksCompilation_create);
-  EXECUTE_FUNCTION_RETURN(model, compilation);
-}
-
-/**
- * Destroy a compilation.
- *
- * <p>If called on a compilation for which
- * {@link ANeuralNetworksCompilation_start} has been called, the
- * function will return immediately but will mark the compilation to be deleted
- * once the compilation completes. The {@link ANeuralNetworksCompilation_wait}
- * will return ERROR_DELETED.
- *
- * See {@link ANeuralNetworksCompilation} for information on multithreaded
- * usage.
- *
- * @param compilation The compilation to be destroyed. Passing NULL is
- * acceptable and results in no operation.
- */
-inline void ANeuralNetworksCompilation_free(
-    ANeuralNetworksCompilation* compilation) {
-  LOAD_FUNCTION(ANeuralNetworksCompilation_free);
-  EXECUTE_FUNCTION(compilation);
-}
-
-/**
- * Sets the execution preference.
- *
- * <p>Provides guidance to the runtime when trade-offs are possible.</p>
- *
- * See {@link ANeuralNetworksCompilation} for information on multithreaded
- * usage.
- *
- * @param compilation The compilation to be modified.
- * @param preference Either {@link PREFER_LOW_POWER},
- *                  {@link PREFER_SINGLE_FAST_ANSWER}, or
- *                  {@link PREFER_SUSTAINED_SPEED}.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksCompilation_setPreference(
-    ANeuralNetworksCompilation* compilation, int32_t preference) {
-  LOAD_FUNCTION(ANeuralNetworksCompilation_setPreference);
-  EXECUTE_FUNCTION_RETURN(compilation, preference);
-}
-
-/**
- * Waits until the compilation completes.
- *
- * More than one thread can wait on a compilation. When the compilation
- * completes, all threads will be released.
- *
- * See {@link ANeuralNetworksCompilation} for information on multithreaded
- * usage.
- *
- * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally.
- */
-inline int ANeuralNetworksCompilation_finish(
-    ANeuralNetworksCompilation* compilation) {
-  LOAD_FUNCTION(ANeuralNetworksCompilation_finish);
-  EXECUTE_FUNCTION_RETURN(compilation);
-}
-/**
- * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
- * This only creates the object. Computation is only performed once
- * {@link ANeuralNetworksExecution_startCompute} is invoked.
- *
- * <p>The provided compilation must outlive the execution.</p>
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
- * @param execution The newly created object or NULL if unsuccessful.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
- *         if the compilation is invalid.
- */
-inline int ANeuralNetworksExecution_create(
-    ANeuralNetworksCompilation* compilation,
-    ANeuralNetworksExecution** execution) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_create);
-  EXECUTE_FUNCTION_RETURN(compilation, execution);
-}
-
-/**
- * Destroy an execution.
- *
- * <p>If called on an execution for which
- * {@link ANeuralNetworksExecution_startCompute} has been called, the
- * function will return immediately but will mark the execution to be deleted
- * once the computation completes.   The {link ANeuralNetworksExecution_wait}
- * will return ANEURALNETWORKS_ERROR_DELETED.
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be destroyed. Passing NULL is acceptable
- * and results in no operation.
- */
-inline void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_free);
-  EXECUTE_FUNCTION(execution);
-}
-
-/**
- * Associate a user buffer with an input of the model of the
- * {@link ANeuralNetworksExecution}.
- *
- * <p>The provided buffer must outlive the execution.</p>
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be modified.
- * @param index The index of the input argument we are setting. It is
- *              an index into the lists passed to
- *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
- *              the index associated with {@link
- * ANeuralNetworksModel_addOperand}.
- * @param type The type of the operand. This should be used to specify the
- *             dimensions that were set to 0 when the operand was added to the
- *             model. All other properties of the type must be the same as
- *             specified in the model. If the type is the same as specified
- *             when the model was built, NULL can be passed.
- * @param buffer The buffer containing the data.
- * @param length The length in bytes of the buffer.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
- * the name is not recognized or the buffer is too small for the input.
- */
-inline int ANeuralNetworksExecution_setInput(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const void* buffer, size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_setInput);
-  EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length);
-}
-
-/**
- * Associate part of a memory object with an input of the model of the
- * {@link ANeuralNetworksExecution}.
- *
- * <p>The provided memory must outlive the execution.</p>
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be modified.
- * @param index The index of the input argument we are setting. It is
- *              an index into the lists passed to
- *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
- *              the index associated with {@link
- * ANeuralNetworksModel_addOperand}.
- * @param type The type of the operand. This can be used to specify the
- *             dimensions that were set to 0 when the operand was added to the
- *             model. All other values must be the same as specified in the
- *             model. If the type is the same as specified when the model
- *             was built, NULL can be passed.
- * @param memory The memory containing the data.
- * @param offset This specifies the location of the data within the memory.
- *               The offset is in bytes from the start of memory.
- * @param length The size in bytes of the data value.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
- * the name is not recognized or the buffer is too small for the input.
- */
-inline int ANeuralNetworksExecution_setInputFromMemory(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_setInputFromMemory);
-  EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length);
-}
-
-/**
- * Associate a user buffer with an output of the model of the
- * {@link ANeuralNetworksExecution}.
- *
- * <p>The provided buffer must outlive the execution.</p>
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be modified.
- * @param index The index of the output argument we are setting. It is
- *              an index into the lists passed to
- *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
- *              the index associated with {@link
- * ANeuralNetworksModel_addOperand}.
- * @param type The type of the operand. This can be used to specify the
- *             dimensions that were set to 0 when the operand was added to the
- *             model. All other values must be the same as specified in the
- *             model. If the type is the same as specified when the model
- *             was built, NULL can be passed.
- * @param buffer The buffer where the data is to be written.
- * @param length The length in bytes of the buffer.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
- * the name is not recognized or the buffer is too small for the output.
- */
-inline int ANeuralNetworksExecution_setOutput(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, void* buffer, size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_setOutput);
-  EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length);
-}
-
-/**
- * Associate part of a memory object with an output of the model of the
- * {@link ANeuralNetworksExecution}.
- *
- * <p>The provided memory must outlive the execution.</p>
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be modified.
- * @param index The index of the output argument we are setting. It is
- *              an index into the lists passed to
- *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
- *              the index associated with {@link
- * ANeuralNetworksModel_addOperand}.
- * @param type The type of the operand. This can be used to specify the
- *             dimensions that were set to 0 when the operand was added to the
- *             model. All other values must be the same as specified in the
- *             model. If the type is the same as specified when the model
- *             was built, NULL can be passed.
- * @param memory The memory where the data is to be stored.
- * @param offset This specifies the location of the data within the memory.
- *               The offset is in bytes from the start of memory.
- * @param length The length in bytes of the data value.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
- * the name is not recognized or the buffer is too small for the output.
- */
-inline int ANeuralNetworksExecution_setOutputFromMemory(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_setOutputFromMemory);
-  EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length);
-}
-
-/**
- * Schedule evaluation of the execution.
- *
- * <p>Schedules evaluation of the execution. Once the model has been
- * applied and the outputs are ready to be consumed, the execution will be
- * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that signal.
- * </p>
- *
- * Multiple executions can be scheduled and evaluated concurrently, and
- * compilations can be performed concurrently with executions. The runtime makes
- * no guarantee on the ordering of the completion of compilations and
- * executions. If it's important to the application, the application should
- * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and
- * {@link ANeuralNetworksExecution_wait}.
- *
- * ANeuralNetworksExecution_wait must be called to recuperate the resources used
- * by the execution.
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @param execution The execution to be scheduled and executed.
- *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
- */
-inline int ANeuralNetworksExecution_startCompute(
-    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) {
-  LOAD_FUNCTION(ANeuralNetworksExecution_startCompute);
-  EXECUTE_FUNCTION_RETURN(execution, event);
-}
-
-/**
- * Waits until the execution completes.
- *
- * More than one thread can wait on an event. When the execution completes,
- * all threads will be released.
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- *
- * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
- */
-inline int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) {
-  LOAD_FUNCTION(ANeuralNetworksEvent_wait);
-  EXECUTE_FUNCTION_RETURN(event);
-}
-
-/**
- * Destroys the event.
- *
- * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- */
-inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
-  LOAD_FUNCTION(ANeuralNetworksEvent_free);
-  EXECUTE_FUNCTION(event);
-}
-
-#endif  // __NEURAL_NETWORKS_SHIM__
diff --git a/runtime/libs/nnapi/v1.2/CMakeLists.txt b/runtime/libs/nnapi/v1.2/CMakeLists.txt
deleted file mode 100644
index 21ec3015f..000000000
--- a/runtime/libs/nnapi/v1.2/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_library(nnfw_lib_nnapi_1_2 INTERFACE)
-
-target_include_directories(nnfw_lib_nnapi_1_2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_link_libraries(nnfw_lib_nnapi_1_2 INTERFACE nnfw-nnapi-header)
diff --git a/runtime/nnapi-header/include/NeuralNetworks.h b/runtime/nnapi-header/include/NeuralNetworks.h
index 7400806d8..0c54d7582 100644
--- a/runtime/nnapi-header/include/NeuralNetworks.h
+++ b/runtime/nnapi-header/include/NeuralNetworks.h
@@ -24,8 +24,8 @@
  * @file NeuralNetworks.h
  */
 
-#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H
-#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H
+#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H
+#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H
 
 /******************************************************************
  *
@@ -43,16 +43,14 @@
  *   - DO NOT CHANGE THE LAYOUT OR SIZE OF STRUCTURES
  */
 
-// For compatibility with android, check __ANDROID_API__ is defined
-// If __ANDROID_API__ is pre-defined, this header may be used for android
-#ifndef __ANDROID_API__
-#define __ANDROID_API__ 29
-#define __ANDROID_API_Q__ 29
+// For compatibility with android, check __ANDROID__ is defined
+#ifndef __ANDROID__
+#define __ANDROID_API__ 30
 #define __INTRODUCED_IN(api_level)
 typedef struct AHardwareBuffer AHardwareBuffer;
 #else
 #include <android/hardware_buffer.h>
-#endif // __ANDROID_API__
+#endif // __ANDROID__
 #include <stddef.h>
 #include <stdint.h>
 #include <sys/cdefs.h>
@@ -62,7 +60,11 @@ __BEGIN_DECLS
 /**
  * Operand types.
  *
- * The type of operands that can be added to a model.
+ * The type of an operand in a model.
+ *
+ * Types prefaced with ANEURALNETWORKS_TENSOR_* must be used for tensor data (i.e., tensors
+ * with at least one dimension). Types not prefaced by ANEURALNETWORKS_TENSOR_* represent
+ * scalar values and must have no dimensions.
  *
  * Although we define many types, most operators accept just a few
  * types. Most used are {@link ANEURALNETWORKS_TENSOR_FLOAT32},
@@ -94,7 +96,6 @@ typedef enum {
      *   real_value = (integer_value - zeroPoint) * scale.
      */
     ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
-#if __ANDROID_API__ >= __ANDROID_API_Q__
     /**
      * An 8 bit boolean scalar value.
      *
@@ -160,7 +161,6 @@ typedef enum {
      * Available since API level 29.
      */
     ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11,
-
     /**
      * A tensor of 16 bit unsigned integers that represent real numbers.
      *
@@ -175,7 +175,6 @@ typedef enum {
      * Available since API level 29.
      */
     ANEURALNETWORKS_TENSOR_QUANT16_ASYMM = 12,
-
     /**
      * A tensor of 8 bit signed integers that represent real numbers.
      *
@@ -188,14 +187,36 @@ typedef enum {
      * Available since API level 29.
      */
     ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
-#endif  // __ANDROID_API__ >= __ANDROID_API_Q__
+    /**
+     * A tensor of 8 bit signed integers that represent real numbers.
+     *
+     * Attached to this tensor are two numbers that can be used to convert the
+     * 8 bit integer to the real value and vice versa. These two numbers are:
+     * - scale: a 32 bit floating point value greater than zero.
+     * - zeroPoint: a 32 bit integer, in range [-128, 127].
+     *
+     * The formula is:
+     * real_value = (integer_value - zeroPoint) * scale.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED = 14,
 
+    /**
+     * A reference to a model.
+     *
+     * {@link ANeuralNetworksModel_setOperandValueFromModel} must be used to set
+     * the value for an Operand of this type.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_MODEL = 15,
 } OperandCode;
 
 /**
  * Operation types.
  *
- * The type of operations that can be added to a model.
+ * The type of an operation in a model.
  *
  * Available since API level 27.
  */
@@ -231,6 +252,8 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -238,15 +261,19 @@ typedef enum {
      * * 0: A tensor.
      * * 1: A tensor of the same {@link OperandCode}, and compatible dimensions
      *      as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scales and zeroPoint can be different from input0 scale and zeroPoint.
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *      {@link FuseCode} values. Specifies the activation to
      *      invoke on the result.
+     *      For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor,
+     *      the {@link FuseCode} must be "NONE".
      *
      * Outputs:
      * * 0: The sum, a tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 27.
@@ -270,18 +297,20 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both explicit padding and implicit padding are supported.
      *
      * Inputs (explicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
@@ -307,8 +336,8 @@ typedef enum {
      *
      * Inputs (implicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
      *      padding scheme, has to be one of the
      *      {@link PaddingCode} values.
@@ -330,7 +359,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, out_height, out_width, depth].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -346,8 +376,9 @@ typedef enum {
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API
-     *   level 29, see the input section)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *   (full support since API level 29, see the input section)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -357,6 +388,9 @@ typedef enum {
      *            Before API level 29, all input tensors of
      *            {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
      *            must have the same scale and zeroPoint as the output tensor.
+     *            Input tensors of
+     *            {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *            are allowed to have different scale and zeroPoint.
      *            Since API level 29, zero-sized tensors are supported.
      * * n: An {@link ANEURALNETWORKS_INT32} scalar, specifying the
      *      concatenation axis.
@@ -373,7 +407,7 @@ typedef enum {
     ANEURALNETWORKS_CONCATENATION = 2,
 
     /**
-     * Performs an 2-D convolution operation.
+     * Performs a 2-D convolution operation.
      *
      * The CONV_2D op sweeps a 2-D filter that can mix channels together over a
      * batch of images, applying the filter to each window of each image of the
@@ -409,31 +443,46 @@ typedef enum {
      * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
      * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
      *
+     * Available since API level 30:
+     * * Quantized signed (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to
+     * * * input.scale * filter.scale).
+     *
+     * * Quantized signed with filter symmetric per channel quantization (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
+     * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
+     *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both explicit padding and implicit padding are supported.
      *
      * Inputs (explicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
-     *      specifying the input. Since API level 29, zero batches is supported
-     *      for this tensor.
+     *      specifying the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: A 4-D tensor, of shape
      *      [depth_out, filter_height, filter_width, depth_in], specifying the
-     *      filter. For tensor of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel
-     *      dimension (extraParams.channelQuant.channelDim) must be set to 0.
+     *      filter.
+     *      For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}
+     *      the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim)
+     *      must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
-     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *      or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
-     *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
      *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
@@ -466,22 +515,25 @@ typedef enum {
      *
      * Inputs (implicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
-     *      specifying the input. Since API level 29, zero batches is supported
-     *      for this tensor.
+     *      specifying the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: A 4-D tensor, of shape
      *      [depth_out, filter_height, filter_width, depth_in], specifying the
-     *      filter. For tensor of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel
-     *      dimension (extraParams.channelQuant.channelDim) must be set to 0.
+     *      filter.
+     *      For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}
+     *      the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim)
+     *      must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
-     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *      or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same
+     *      type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
-     *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
      *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
      *      padding scheme, has to be one of the
@@ -509,10 +561,9 @@ typedef enum {
      *
      * Outputs:
      * * 0: The output 4-D tensor, of shape
-     *      [batches, out_height, out_width, depth_out]. Before API level 29,
-     *      for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
-     *      the following condition must be satisfied:
-     *      output_scale > input_scale * filter_scale
+     *      [batches, out_height, out_width, depth_out].
+     *      Before API level 29, for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      the following condition must be satisfied: output_scale > input_scale * filter_scale
      *
      * Available since API level 27.
      */
@@ -559,10 +610,23 @@ typedef enum {
      * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
      * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
      *
+     * Available since API level 30:
+     * * Quantized signed (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to
+     * * * input.scale * filter.scale).
+     *
+     * * Quantized signed with filter symmetric per channel quantization (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
+     * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
+     *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both explicit padding and implicit padding are supported.
      *
@@ -570,18 +634,20 @@ typedef enum {
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
      *      specifying the input.
      * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out],
-     *      specifying the filter. For tensor of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel
-     *      dimension (extraParams.channelQuant.channelDim) must be set to 3.
+     *      specifying the filter.
+     *      For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}
+     *      the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim)
+     *      must be set to 3.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
-     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *      or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
-     *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
      *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
@@ -620,14 +686,15 @@ typedef enum {
      * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out],
      *      specifying the filter.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
-     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *      or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
-     *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
      *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
      *      padding scheme, has to be one of the
@@ -654,12 +721,11 @@ typedef enum {
      *      cells between each filter element on height dimension. If this input is set,
      *      input 9 (dilation factor for width) must be specified as well.
      *      Available since API level 29.
-
      *
      * Outputs:
      * * 0: The output 4-D tensor, of shape
-     *      [batches, out_height, out_width, depth_out]. Before API level 29,
-     *      for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      [batches, out_height, out_width, depth_out]. Before API level 29, for
+     *      output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
      *      the following condition must be satisfied:
      *      output_scale > input_scale * filter_scale
      *
@@ -686,11 +752,13 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Inputs:
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
@@ -705,7 +773,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape [batch, height*block_size,
      *      width*block_size, depth/(block_size*block_size)].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -723,6 +792,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported output tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
@@ -731,7 +801,8 @@ typedef enum {
      * Supported tensor rank: up to 4
      *
      * Inputs:
-     * * 0: A tensor. Since API level 29, this tensor may be zero-sized.
+     * * 0: A tensor.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: A tensor with the same shape as input0.
@@ -761,9 +832,11 @@ typedef enum {
      * and an error must be reported.
      *
      * Supported value tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 30)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-     * * {@link ANEURALNETWORKS_TENSOR_INT32}
-     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported value tensor rank: from 2
      *
@@ -777,7 +850,8 @@ typedef enum {
      * * 0: A n-D tensor with the same rank and shape as the Values
      *      tensor, except for the first dimension which has the same size
      *      as Lookups' only dimension.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input1.
      *
      * Available since API level 27.
@@ -816,6 +890,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
@@ -826,26 +901,26 @@ typedef enum {
      *      [batch_size, input_size], where "input_size" corresponds to the
      *      number of inputs to the layer, matching the second dimension of
      *      weights, and "batch_size" is calculated by dividing the number of
-     *      elements by "input_size". Since API level 29, zero batch_size is
-     *      supported for this tensor.
+     *      elements by "input_size".
+     *      Since API level 29, zero batch_size is supported for this tensor.
      * * 1: A 2-D tensor, specifying the weights, of shape
      *      [num_units, input_size], where "num_units" corresponds to the number
      *      of output nodes.
      * * 2: A 1-D tensor, of shape [num_units], specifying the bias. For input
      *      tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the bias should
-     *      also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input tensor
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be
-     *      of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and
-     *      bias_scale == input_scale * filter_scale.
+     *      also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *      For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32},
+     *      with zeroPoint of 0 and bias_scale == input_scale * filter_scale.
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *      {@link FuseCode} values. Specifies the activation to
      *      invoke on the result.
      *
      * Outputs:
-     * * 0: The output tensor, of shape [batch_size, num_units]. Before API
-     *      level 29, for output tensor of {@link
-     *      ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following condition must
-     *      be satisfied: output_scale > input_scale * filter_scale.
+     * * 0: The output tensor, of shape [batch_size, num_units]. Before API level 29, for
+     *      output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following
+     *      condition must be satisfied: output_scale > input_scale * filter_scale.
      *
      * Available since API level 27.
      */
@@ -911,7 +986,7 @@ typedef enum {
     ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
 
     /**
-     * Applies L2 normalization along the depth dimension.
+     * Applies L2 normalization along the axis dimension.
      *
      * The values in the output tensor are computed as:
      *
@@ -919,13 +994,13 @@ typedef enum {
      *         input[batch, row, col, channel] /
      *         sqrt(sum_{c} pow(input[batch, row, col, c], 2))
      *
-     * For input tensor with rank less than 4, independently normalizes each
-     * 1-D slice along dimension dim.
+     * By default the axis dimension is the last dimension of the input tensor.
      *
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      * Tensors with rank less than 4 are only supported since API level 29.
@@ -942,6 +1017,12 @@ typedef enum {
      * * 0: A tensor of the same {@link OperandCode} and same shape as input0.
      *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
      *      the scale must be 1.f / 128 and the zeroPoint must be 128.
+     *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the scale must be 1.f / 128 and the zeroPoint must be 0.
+     *
+     *      NOTE: Before API level 30, if the elements along an axis are all zeros,
+     *      the result is undefined. Since API level 30, if the elements along an axis
+     *      are all zeros, the result is logical zero.
      *
      * Available since API level 27.
      */
@@ -967,13 +1048,14 @@ typedef enum {
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both explicit padding and implicit padding are supported.
      *
      * Inputs (explicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
@@ -999,8 +1081,8 @@ typedef enum {
      *
      * Inputs (implicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
      *      padding scheme, has to be one of the
      *      {@link PaddingCode} values.
@@ -1095,17 +1177,20 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
      * Inputs:
-     * * 0: A tensor, specifying the input. Since API level 29, this tensor may
-     *      be zero-sized.
+     * * 0: A tensor, specifying the input.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: The output tensor of same shape as input0.
      *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
      *      the scale must be 1.f / 256 and the zeroPoint must be 0.
+     *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the scale must be 1.f / 256 and the zeroPoint must be -128.
      *
      * Available since API level 27.
      */
@@ -1158,7 +1243,7 @@ typedef enum {
      * Outputs:
      * * 0: If the projection type is Sparse:
      *      Output.Dim == { Tensor[0].Dim[0] }
-     *      A tensor of int32 that represents hash signatures,
+     *      A tensor of int32 that represents hash signatures.
      *
      *      If the projection type is Dense:
      *      Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
@@ -1248,7 +1333,7 @@ typedef enum {
      * * The projection bias (\f$b_{proj}\f$) may (but not required to) have a
      *   value if the recurrent projection layer exists, and should otherwise
      *   have no value.
-     * * (API level >= 29) The four layer normalization weights either all have
+     * * (API level 29 or later) The four layer normalization weights either all have
      *   values or none of them have values. Additionally, if CIFG is used,
      *   input layer normalization weights tensor is omitted and the other layer
      *   normalization weights either all have values or none of them have
@@ -1406,18 +1491,20 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both explicit padding and implicit padding are supported.
      *
      * Inputs (explicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
@@ -1443,8 +1530,8 @@ typedef enum {
      *
      * Inputs (implicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
      *      padding scheme, has to be one of the
      *      {@link PaddingCode} values.
@@ -1466,7 +1553,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, out_height, out_width, depth].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1496,6 +1584,8 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -1506,10 +1596,13 @@ typedef enum {
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *      {@link FuseCode} values. Specifies the activation to
      *      invoke on the result.
+     *      For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor,
+     *      the {@link FuseCode} must be "NONE".
      *
      * Outputs:
      * * 0: The product, a tensor of the same {@link OperandCode} as input0.
-     *      For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the following condition must be satisfied:
      *      output_scale > input1_scale * input2_scale.
      *
@@ -1528,16 +1621,18 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
      * Inputs:
-     * * 0: A tensor, specifying the input. Since API level 29, this tensor may
-     *      be zero-sized.
+     * * 0: A tensor, specifying the input.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: The output tensor of same shape as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1555,16 +1650,18 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
      * Inputs:
-     * * 0: A tensor, specifying the input. Since API level 29, this tensor may
-     *      be zero-sized.
+     * * 0: A tensor, specifying the input.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: The output tensor of the same shape as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1582,16 +1679,18 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
      * Inputs:
-     * * 0: A tensor, specifying the input. Since API level 29, this tensor may
-     *      be zero-sized.
+     * * 0: A tensor, specifying the input.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: The output tensor of same shape as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1608,6 +1707,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
@@ -1624,7 +1724,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: The output tensor, of shape specified by the input shape.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1642,18 +1743,20 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Both resizing by shape and resizing by scale are supported.
      *
      * Inputs (resizing by shape):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
-     *      the input. Since API level 29, zero batches is supported for this
-     *      tensor.
+     *      the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output
      *      width of the output tensor.
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output
@@ -1661,6 +1764,17 @@ typedef enum {
      * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false.
      *      Set to true to specify NCHW data layout for input0 and output0.
      *      Available since API level 29.
+     * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false.  If True, the centers of the 4 corner
+     *      pixels of the input and output tensors are aligned, preserving the
+     *      values at the corner pixels.
+     *      Available since API level 30.
+     * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false. If True, the pixel centers are assumed to
+     *      be at (0.5, 0.5). This is the default behavior of image.resize in
+     *      TF 2.0. If this parameter is True, then align_corners parameter
+     *      must be False.
+     *      Available since API level 30.
      *
      * Inputs (resizing by scale, since API level 29):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
@@ -1679,10 +1793,24 @@ typedef enum {
      *      {@link ANEURALNETWORKS_FLOAT32} otherwise.
      * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false.
      *      Set to true to specify NCHW data layout for input0 and output0.
+     * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false.  If True, the centers of the 4 corner
+     *      pixels of the input and output tensors are aligned, preserving the
+     *      values at the corner pixels.
+     *      Available since API level 30.
+     * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false. If True, the pixel centers are assumed to
+     *      be at (0.5, 0.5). This is the default behavior of image.resize in
+     *      TF 2.0. If this parameter is True, then align_corners parameter
+     *      must be False.
+     *      Available since API level 30.
      *
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, new_height, new_width, depth].
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
+     *      the scale and zeroPoint must be the same as input0.
      *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
@@ -1762,19 +1890,21 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      * Tensors with rank other than 2 or 4 are only supported since API level 29.
      *
      * Inputs:
-     * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped. Since
-     *      API level 29, this tensor may be zero-sized.
+     * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped.
+     *      Since API level 29, this tensor may be zero-sized.
      * * 1: A scalar, specifying the positive scaling factor for the exponent,
-     *      beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scalar must be of
-     *      {@link ANEURALNETWORKS_FLOAT32}. If input0 is of {@link
-     *      ANEURALNETWORKS_TENSOR_FLOAT16}, then the scalar must be of {@link
-     *      ANEURALNETWORKS_FLOAT16}.
+     *      beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32},
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scalar
+     *      must be of {@link ANEURALNETWORKS_FLOAT32}.
+     *      If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, then the
+     *      scalar must be of {@link ANEURALNETWORKS_FLOAT16}.
      * * 2: An optional {@link ANEURALNETWORKS_INT32} scalar, default to -1,
      *      specifying the dimension the activation would be performed on.
      *      Negative index is used to specify axis from the end (e.g. -1 for
@@ -1785,6 +1915,8 @@ typedef enum {
      * * 0: The output tensor of same shape as input0.
      *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
      *      the scale must be 1.f / 256 and the zeroPoint must be 0.
+     *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the scale must be 1.f / 256 and the zeroPoint must be -128.
      *
      * Available since API level 27.
      */
@@ -1808,11 +1940,13 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Inputs:
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
@@ -1827,7 +1961,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape [batches, height/block_size,
      *      width/block_size, depth_in*block_size*block_size].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 27.
@@ -1924,17 +2059,20 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4.
      *
      * Inputs:
-     * * 0: A tensor, specifying the input. Since API level 29, this tensor may
-     *      be zero-sized.
+     * * 0: A tensor, specifying the input.
+     *      Since API level 29, this tensor may be zero-sized.
      *
      * Outputs:
      * * 0: The output tensor of same shape as input0.
      *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
      *      the scale must be 1.f / 128 and the zeroPoint must be 128.
+     *      For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the scale must be 1.f / 128 and the zeroPoint must be 0.
      *
      * Available since API level 27.
      */
@@ -1942,7 +2080,6 @@ typedef enum {
 
     // Operations below are available since API level 28.
 
-    // TODO: make the description easier to understand.
     /**
      * BatchToSpace for N-dimensional tensors.
      *
@@ -1957,11 +2094,13 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Inputs:
      * * 0: An n-D tensor, specifying the tensor to be reshaped
@@ -1974,7 +2113,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 28.
@@ -1988,6 +2128,11 @@ typedef enum {
      * dimensions. The output is the result of dividing the first input tensor
      * by the second, optionally modified by an activation function.
      *
+     * For inputs of {@link ANEURALNETWORKS_TENSOR_INT32}, performs
+     * "floor division" ("//" in Python). For example,
+     *     5 // 2 = 2
+     *    -5 // 2 = -3
+     *
      * Two dimensions are compatible when:
      *     1. they are equal, or
      *     2. one of them is 1
@@ -2008,6 +2153,7 @@ typedef enum {
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2018,6 +2164,8 @@ typedef enum {
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *      {@link FuseCode} values. Specifies the activation to
      *      invoke on the result.
+     *      For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor,
+     *      the {@link FuseCode} must be "NONE".
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
@@ -2038,6 +2186,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2057,23 +2206,27 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
-     *      the scale and zeroPoint must be same as input0.
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
+     *      the scale and zeroPoint must be the same as input0.
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
      *
      * Available since API level 28.
      */
     ANEURALNETWORKS_MEAN = 31,
 
     /**
-     * Pads a tensor with zeros.
+     * Pads a tensor.
      *
      * This operation pads a tensor according to the specified paddings.
      *
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API
-     *   level 29, see the output section)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     *   (full support since API level 29, see the output section)
      *
      * Supported tensor rank: up to 4
      *
@@ -2095,7 +2248,8 @@ typedef enum {
      *      of the padding:
      *          output0.dimension[i] =
      *              padding[i, 0] + input0.dimension[i] + padding[i, 1]
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      *      NOTE: Before API level 29, the pad value for
@@ -2106,7 +2260,6 @@ typedef enum {
      */
     ANEURALNETWORKS_PAD = 32,
 
-    // TODO: make the description easier to understand.
     /**
      * SpaceToBatch for N-Dimensional tensors.
      *
@@ -2121,13 +2274,15 @@ typedef enum {
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API
-     *   level 29, see the output section)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     *   (full support since API level 29, see the output section)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
      * be NCHW, the data storage order of: [batch, channels, height, width].
+     * NCHW is supported since API level 29.
      *
      * Inputs:
      * * 0: An n-D tensor, specifying the input.
@@ -2148,7 +2303,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      *      NOTE: Before API level 29, the pad value for
@@ -2171,6 +2327,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2186,8 +2343,11 @@ typedef enum {
      * * 0: A tensor of the same {@link OperandCode} as input0. Contains the
      *      same data as input, but has one or more dimensions of size 1
      *      removed.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
+     *      If all input dimensions are equal to 1 and are to be squeezed, the
+     *      output shape is [1].
      *
      * Available since API level 28.
      */
@@ -2206,6 +2366,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2235,8 +2396,11 @@ typedef enum {
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0 and rank (n - k),
      *      where k is the number of bits set in shrink_axis_mask.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
+     *      If shrink_axis_mask is true for all input dimensions, the output
+     *      shape is [1].
      *
      * Available since API level 28.
      */
@@ -2270,6 +2434,8 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2280,10 +2446,13 @@ typedef enum {
      * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *      {@link FuseCode} values. Specifies the activation to
      *      invoke on the result.
+     *      For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor,
+     *      the {@link FuseCode} must be "NONE".
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 28.
@@ -2303,6 +2472,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2314,7 +2484,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 28.
@@ -2329,6 +2500,7 @@ typedef enum {
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30)
      *
      * Supported tensor rank: from 1.
      *
@@ -2350,6 +2522,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -2361,6 +2534,7 @@ typedef enum {
      *
      * Outputs:
      * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor.
+     *      If input is 1-dimensional, the output shape is [1].
      *
      * Available since API level 29.
      */
@@ -2376,6 +2550,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -2387,6 +2562,7 @@ typedef enum {
      *
      * Outputs:
      * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor.
+     *      If input is 1-dimensional, the output shape is [1].
      *
      * Available since API level 29.
      */
@@ -2419,7 +2595,8 @@ typedef enum {
      *      and height, dw and dh is the log-scale relative correction factor
      *      for the width and height. For input0 of type
      *      {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, this tensor should be
-     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}. Zero num_rois is
+     *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}. Zero num_rois is
      *      supported for this tensor.
      * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape
      *      [num_rois], specifying the batch index of each box. Boxes with
@@ -2441,7 +2618,54 @@ typedef enum {
     ANEURALNETWORKS_AXIS_ALIGNED_BBOX_TRANSFORM = 41,
 
     /**
-     * Performs a forward LSTM on the input followed by a backward LSTM.
+     * A recurrent neural network layer that applies an LSTM cell to a
+     * sequence of inputs in forward and backward directions.
+     *
+     * The op supports cross-linking via an auxiliary input. Regular cell feeds
+     * one input into the two RNN cells in the following way:
+     *
+     *       INPUT  (INPUT_REVERSED)
+     *         |         |
+     *    ---------------------
+     *    | FW_LSTM   BW_LSTM |
+     *    ---------------------
+     *         |         |
+     *      FW_OUT     BW_OUT
+     *
+     * An op with cross-linking takes two inputs and feeds them into the RNN
+     * cells in the following way:
+     *
+     *       AUX_INPUT   (AUX_INPUT_REVERSED)
+     *           |             |
+     *     INPUT | (INPUT_R'D.)|
+     *       |   |       |     |
+     *    -----------------------
+     *    |  \  /        \    / |
+     *    | FW_LSTM     BW_LSTM |
+     *    -----------------------
+     *         |           |
+     *      FW_OUT      BW_OUT
+     *
+     * The cross-linking mode is enabled iff auxiliary input and auxiliary
+     * weights are present. While stacking this op on top of itself, this
+     * allows to connect both forward and backward outputs from previous cell
+     * to the next cell's input.
+     *
+     * Since API level 30 parallel linking mode is supported. The mode is
+     * enabled if auxiliary input is present but auxiliary weights are omitted.
+     * In this case, the cell feeds inputs into the RNN in the following way:
+     *
+     *       INPUT (AUX_INPUT_REVERSED)
+     *         |         |
+     *    ---------------------
+     *    | FW_LSTM   BW_LSTM |
+     *    ---------------------
+     *         |         |
+     *      FW_OUT     BW_OUT
+     *
+     * While stacking this op on top of itself, this allows to connect both
+     * forward and backward outputs from previous cell to the next cell's
+     * corresponding inputs.
      *
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
@@ -2451,7 +2675,6 @@ typedef enum {
      *
      * All input and output tensors must be of the same type.
      *
-     *
      * Inputs:
      * * 0: The input.
      *      A 3-D tensor of shape:
@@ -2543,25 +2766,34 @@ typedef enum {
      * * 38: The backward input cell state.
      *       A 2-D tensor of shape [batch_size, bw_num_units].
      * * 39: The auxiliary input. Optional.
-     *       A 3-D tensor of shape [max_time, batch_size, input_size], where “batch_size”
-     *       corresponds to the batching dimension, and “input_size” is the size
-     *       of the input.
-     * * 40: The forward auxiliary input-to-input weights. Optional.
-     *       A 2-D tensor of shape [fw_num_units, input_size].
-     * * 41: The forward auxiliary input-to-forget weights. Optional.
-     *       A 2-D tensor of shape [fw_num_units, input_size].
-     * * 42: The forward auxiliary input-to-cell weights. Optional.
-     *       A 2-D tensor of shape [fw_num_units, input_size].
-     * * 43: The forward auxiliary input-to-output weights. Optional.
-     *       A 2-D tensor of shape [fw_num_units, input_size].
-     * * 44: The backward auxiliary input-to-input weights. Optional.
-     *       A 2-D tensor of shape [bw_num_units, input_size].
-     * * 45: The backward auxiliary input-to-forget weights. Optional.
-     *       A 2-D tensor of shape [bw_num_units, input_size].
-     * * 46: The backward auxiliary input-to-cell weights. Optional.
-     *       A 2-D tensor of shape [bw_num_units, input_size].
-     * * 47: The backward auxiliary input-to-output weights. Optional.
-     *       A 2-D tensor of shape [bw_num_units, input_size].
+     *       A 3-D tensor of shape [max_time, batch_size, aux_input_size],
+     *       where “batch_size” corresponds to the batching dimension, and
+     *       “aux_input_size” is the size of the auxiliary input. Optional. See
+     *       the docs above for the usage modes explanation.
+     * * 40: The forward auxiliary input-to-input weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [fw_num_units, aux_input_size].
+     * * 41: The forward auxiliary input-to-forget weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [fw_num_units, aux_input_size].
+     * * 42: The forward auxiliary input-to-cell weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [fw_num_units, aux_input_size].
+     * * 43: The forward auxiliary input-to-output weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [fw_num_units, aux_input_size].
+     * * 44: The backward auxiliary input-to-input weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [bw_num_units, aux_input_size].
+     * * 45: The backward auxiliary input-to-forget weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [bw_num_units, aux_input_size].
+     * * 46: The backward auxiliary input-to-cell weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [bw_num_units, aux_input_size].
+     * * 47: The backward auxiliary input-to-output weights.
+     *       Optional. See the docs above for the usage modes explanation.
+     *       A 2-D tensor of shape [bw_num_units, aux_input_size].
      * * 48: The activation function.
      *       A value indicating the activation function:
      *       <ul>
@@ -2576,17 +2808,17 @@ typedef enum {
      *       then clipping is disabled.
      *       If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32},
      *       this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32},
-     *       otherwise if all the input tensors have the type {@link
-     *       ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link
-     *       ANEURALNETWORKS_FLOAT16}.
+     *       otherwise if all the input tensors have the type
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be
+     *       of type {@link ANEURALNETWORKS_FLOAT16}.
      * * 50: The clipping threshold for the output from the
      *       projection layer, such that values are bound within
      *       [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
      *       If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32},
      *       this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32},
-     *       otherwise if all the input tensors have the type {@link
-     *       ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link
-     *       ANEURALNETWORKS_FLOAT16}.
+     *       otherwise if all the input tensors have the type
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be
+     *       of type {@link ANEURALNETWORKS_FLOAT16}.
      * * 51: merge_outputs
      *       An {@link ANEURALNETWORKS_BOOL} scalar specifying if the outputs
      *       from forward and backward cells should be merged.
@@ -2633,8 +2865,36 @@ typedef enum {
      *      A 3-D tensor of shape:
      *        If time-major: [max_time, batch_size, bw_output_size]
      *        If batch-major: [batch_size, max_time, bw_output_size]
+     * * 2: The forward activation state output.
+     *      A 2-D tensor of shape [batch_size, fw_output_size] containing an
+     *      activation state from the last time step in the sequence. This
+     *      output is optional and can be omitted. If this output is present
+     *      then outputs 3-5 must be present as well.
+     *      Available since API level 30.
+     * * 3: The forward cell state output.
+     *      A tensor of shape [batch_size, fw_cell_size] containing a cell state
+     *      from the last time step in the sequence. This output is optional
+     *      and can be omitted. If this output is present
+     *      then outputs 2, 4, 5 must be present as well.
+     *      Available since API level 30.
+     * * 4: The backward activation state output.
+     *      A 2-D tensor of shape [batch_size, bw_output_size] containing an
+     *      activation state from the last time step in the sequence. This
+     *      output is optional and can be omitted. If this output is present
+     *      then outputs 2, 3, 5 must be present as well.
+     *      Available since API level 30.
+     * * 5: The backward cell state output.
+     *      A tensor of shape [batch_size, bw_cell_size] containing a cell state
+     *      from the last time step in the sequence. This output is optional
+     *      and can be omitted. If this output is present
+     *      then outputs 2-4 must be present as well.
+     *      Available since API level 30.
      *
      * Available since API level 29.
+     *
+     * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI
+     * does not maintain internal states. This operator does not support the usage pattern in which
+     * multiple cells are chained and state tensors are propagated.
      */
     ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_LSTM = 42,
 
@@ -2662,8 +2922,8 @@ typedef enum {
      * * “activation” is the function passed as the “fused_activation_function”
      *   argument (if not “NONE”).
      *
-     * The op also supports an auxiliary input. Regular cell feeds one input
-     * into the two RNN cells in the following way:
+     * The op supports cross-linking via an auxiliary input. Regular cell feeds
+     * one input into the two RNN cells in the following way:
      *
      *       INPUT  (INPUT_REVERSED)
      *         |         |
@@ -2673,8 +2933,8 @@ typedef enum {
      *         |         |
      *      FW_OUT     BW_OUT
      *
-     * An op with an auxiliary input takes two inputs and feeds them into the
-     * RNN cells in the following way:
+     * An op with cross-linking takes two inputs and feeds them into the RNN
+     * cells in the following way:
      *
      *       AUX_INPUT   (AUX_INPUT_REVERSED)
      *           |             |
@@ -2687,9 +2947,26 @@ typedef enum {
      *         |           |
      *      FW_OUT      BW_OUT
      *
+     * The cross-linking mode is enabled iff auxiliary input and auxiliary
+     * weights are present. While stacking this op on top of itself, this
+     * allows to connect both forward and backward outputs from previous cell
+     * to the next cell's input.
+     *
+     * Since API level 30 parallel linking mode is supported. The mode is
+     * enabled if auxiliary input is present but auxiliary weights are omitted.
+     * In this case, the cell feeds inputs into the RNN in the following way:
+     *
+     *       INPUT (AUX_INPUT_REVERSED)
+     *         |         |
+     *    ---------------------
+     *    | FW_RNN     BW_RNN |
+     *    ---------------------
+     *         |         |
+     *      FW_OUT     BW_OUT
+     *
      * While stacking this op on top of itself, this allows to connect both
      * forward and backward outputs from previous cell to the next cell's
-     * inputs.
+     * corresponding inputs.
      *
      * Supported tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
@@ -2722,11 +2999,17 @@ typedef enum {
      *      A 2-D tensor of shape [batchSize, bwNumUnits]. Specifies a hidden
      *      state input for the first time step of the computation.
      * * 9: auxInput.
-     *      A 3-D tensor. The shape is the same as of the input 0.
+     *      A 3-D tensor. The shape is defined by the input 6 (timeMajor). If
+     *      it is set to true, then the input has a shape [maxTime, batchSize,
+     *      auxInputSize], otherwise the input has a shape [batchSize, maxTime,
+     *      auxInputSize]. Can be omitted. See the docs above for the usage
+     *      modes explanation.
      * * 10:fwAuxWeights.
-     *      A 2-D tensor of shape [fwNumUnits, inputSize].
+     *      A 2-D tensor of shape [fwNumUnits, auxInputSize]. Can be omitted.
+     *      See the docs above for the usage modes explanation.
      * * 11:bwAuxWeights.
-     *      A 2-D tensor of shape [bwNumUnits, inputSize].
+     *      A 2-D tensor of shape [bwNumUnits, auxInputSize]. Can be omitted.
+     *      See the docs above for the usage modes explanation.
      * * 12:fusedActivationFunction.
      *      A {@link FuseCode} value indicating the activation function. If
      *      “NONE” is specified then it results in a linear activation.
@@ -2752,8 +3035,24 @@ typedef enum {
      *      (timeMajor). If it is set to true, then the shape is set to
      *      [maxTime, batchSize, bwNumUnits], otherwise the shape is set to
      *      [batchSize, maxTime, bwNumUnits].
+     * * 2: The forward hidden state output.
+     *      A 2-D tensor of shape [batchSize, fwNumUnits] containing a hidden
+     *      state from the last time step in the sequence. This output is
+     *      optional and can be omitted. If this output is present then output
+     *      3 must be present as well.
+     *      Available since API level 30.
+     * * 3: The backward hidden state output.
+     *      A 2-D tensor of shape [batchSize, bwNumUnits] containing a hidden
+     *      state from the last time step in the sequence. This output is
+     *      optional and can be omitted. If this output is present then output
+     *      2 must be present as well.
+     *      Available since API level 30.
      *
      * Available since API level 29.
+     *
+     * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI
+     * does not maintain internal states. This operator does not support the usage pattern in which
+     * multiple cells are chained and state tensors are propagated.
      */
     ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_RNN = 43,
 
@@ -2780,6 +3079,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Inputs:
      * * 0: A 2-D Tensor of shape [num_rois, num_classes], specifying the score
@@ -2791,7 +3091,11 @@ typedef enum {
      *      order of the boxes corresponds with input0. For input0 of type
      *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of
      *      {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint of 0 and
-     *      scale of 0.125. Zero num_rois is supported for this tensor.
+     *      scale of 0.125.
+     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM},
+     *      with zeroPoint of -128 and scale of 0.125.
+     *      Zero num_rois is supported for this tensor.
      * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape
      *      [num_rois], specifying the batch index of each box. Boxes with
      *      the same batch index are grouped together.
@@ -2818,6 +3122,8 @@ typedef enum {
      *      [num_output_rois], specifying the score of each output box. The boxes
      *      are grouped by batches, but the sequential order in each batch is not
      *      guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      or {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the scale and zero point must be the same as input0.
      * * 1: A 2-D Tensor of the same {@link OperandCode} as input1, with shape
      *      [num_output_rois, 4], specifying the coordinates of each
@@ -2837,7 +3143,7 @@ typedef enum {
     ANEURALNETWORKS_BOX_WITH_NMS_LIMIT = 44,
 
     /**
-     * Casts a tensor to a new type.
+     * Casts a tensor to a type.
      *
      * This operation ignores the scale and zeroPoint of quanized tensors,
      * e.g. it treats a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} input
@@ -2848,6 +3154,14 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * Since API level 30, casting tensors of the following
+     * {@link OperandCode} to the same {@link OperandCode} is supported:
+     * * {@link ANEURALNETWORKS_TENSOR_BOOL8}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
      *
      * Supported tensor rank: from 1
      *
@@ -2880,6 +3194,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -2894,7 +3209,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} and same shape as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -2952,14 +3268,14 @@ typedef enum {
      * * 11: A scalar, score_threshold. Boxes with scores lower than the
      *       threshold are filtered before sending to the NMS algorithm. The
      *       scalar must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of
-     *       {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link
-     *       ANEURALNETWORKS_FLOAT32} if input0 is of {@link
-     *       ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of
+     *       {@link ANEURALNETWORKS_FLOAT32} if input0 is of
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
      * * 12: A scalar, specifying the IoU threshold for hard NMS. The scalar
-     *       must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of {@link
-     *       ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link
-     *       ANEURALNETWORKS_FLOAT32} if input0 is of {@link
-     *       ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *       must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of
+     *       {@link ANEURALNETWORKS_FLOAT32} if input0 is of
+     *       {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
      * * 13: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to include
      *       background class in the list of label map for the output, set
      *       to false to not include the background. When the background
@@ -2992,6 +3308,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3041,6 +3358,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3052,7 +3370,8 @@ typedef enum {
      * Outputs:
      * * 0: An (n + 1)-D tensor with the same {@link OperandCode} and data as
      *      input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -3078,6 +3397,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3092,7 +3412,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: An (n + k - 1)-D tensor with the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -3115,6 +3436,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Inputs:
      * * 0: A 4-D Tensor specifying the score of each anchor at each
@@ -3132,11 +3454,13 @@ typedef enum {
      *      dimensions is the channel dimension.
      * * 2: A 2-D Tensor of shape [num_anchors, 4], specifying the shape of each
      *      predefined anchor, with format [x1, y1, x2, y2]. For input0 of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor should be of
      *      {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with scale of 0.125.
      * * 3: A 2-D Tensor of shape [batches, 2], specifying the size of
      *      each image in the batch, with format [image_height, image_width].
-     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this
+     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this
      *      tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with
      *      scale of 0.125.
      * * 4: An {@link ANEURALNETWORKS_FLOAT32} scalar, specifying the ratio
@@ -3163,7 +3487,8 @@ typedef enum {
      *      [num_output_rois], specifying the score of each output box.
      *      The boxes are grouped by batches, but the sequential order in
      *      each batch is not guaranteed. For type of
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scale and zero
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scale and zero
      *      point must be the same as input0.
      * * 1: A tensor of the same {@link OperandCode} as input3, of shape
      *      [num_output_rois, 4], specifying the coordinates of each output
@@ -3188,6 +3513,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3213,6 +3539,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3271,12 +3598,23 @@ typedef enum {
      * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to
      * * * input.scale * filter.scale).
      *
+     * * Quantized signed (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to
+     * * * input.scale * filter.scale).
+     *
      * * Quantized with symmetric per channel quantization for the filter:
      * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} for input, and output.
      * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter.
      * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
      * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
      *
+     * * Quantized signed with filter symmetric per channel quantization (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
+     * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
+     *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
@@ -3295,8 +3633,9 @@ typedef enum {
      *      {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
      *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
      *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
      *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
@@ -3316,7 +3655,7 @@ typedef enum {
      * * 8: An {@link ANEURALNETWORKS_INT32} scalar, specifying the stride when
      *      walking through input in the ‘height’ dimension.
      * * 9: An {@link ANEURALNETWORKS_INT32} scalar, specifying the number of
-            groups.
+     *      groups.
      * * 10: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the
      *       {@link FuseCode} values. Specifies the activation to
      *       invoke on the result.
@@ -3330,12 +3669,14 @@ typedef enum {
      *      [depth_out, filter_height, filter_width, depth_group], specifying
      *      the filter, where depth_out must be divisible by num_groups.  For
      *      tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}
-     *      the channel dimension (channelDim at
-     *      {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0.
+     *      the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim)
+     *      must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
      *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
      *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same
-     *      type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
      *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint
      *      of 0 and bias_scale == input_scale * filter_scale. For filter tensor
      *      of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
@@ -3360,7 +3701,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, out_height, out_width, depth_out].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 29.
@@ -3382,6 +3724,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
@@ -3398,13 +3741,18 @@ typedef enum {
      *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should
      *      be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint
      *      of 0 and scale of 0.125.
+     *      For input0 of type
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor
+     *      should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with
+     *      zeroPoint of -128 and scale of 0.125.
      * * 2: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify
      *      NCHW data layout for input0. Set to false for NHWC.
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0, with shape
      *      [num_boxes, num_keypoints], specifying score of the keypoints.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from input0 scale and zeroPoint.
      * * 1: A tensor of the same {@link OperandCode} as input1, with shape
      *      [num_boxes, num_keypoints, 2], specifying the location of
@@ -3447,19 +3795,19 @@ typedef enum {
      * * 0: An n-D tensor, specifying the tensor to be normalized.
      * * 1: A scalar, specifying gamma, the scale applied to the normalized
      *      tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if
-     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link
-     *      ANEURALNETWORKS_FLOAT32} if input0 is of {@link
-     *      ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of
+     *      {@link ANEURALNETWORKS_FLOAT32} if input0 is of
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
      * * 2: A scalar, specifying beta, the offset applied to the normalized
      *      tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if
-     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link
-     *      ANEURALNETWORKS_FLOAT32} if input0 is of {@link
-     *      ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of
+     *      {@link ANEURALNETWORKS_FLOAT32} if input0 is of
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
      * * 3: A scalar, specifying epsilon, the small value added to variance to
      *      avoid dividing by zero. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if
-     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link
-     *      ANEURALNETWORKS_FLOAT32} if input0 is of {@link
-     *      ANEURALNETWORKS_TENSOR_FLOAT32}.
+     *      input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of
+     *      {@link ANEURALNETWORKS_FLOAT32} if input0 is of
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
      * * 4: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify
      *      NCHW data layout for input0 and output0. Set to false for NHWC.
      *
@@ -3479,6 +3827,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3505,6 +3854,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3644,6 +3994,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1.
      *
@@ -3656,7 +4007,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 29.
@@ -3671,6 +4023,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1.
      *
@@ -3683,7 +4036,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 29.
@@ -3719,6 +4073,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3744,6 +4099,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -3761,7 +4117,8 @@ typedef enum {
      *      pad value must be of {@link ANEURALNETWORKS_FLOAT16}.
      *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the
      *      pad value must be of {@link ANEURALNETWORKS_FLOAT32}.
-     *      For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
      *      the pad value must be of {@link ANEURALNETWORKS_INT32}. The
      *      scale and zeroPoint are assumed to be the same as in input0.
      *
@@ -3773,7 +4130,8 @@ typedef enum {
      *      of the padding:
      *          output0.dimension[i] =
      *              padding[i, 0] + input0.dimension[i] + padding[i, 1]
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -3836,6 +4194,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -3846,8 +4205,9 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
-     *      the scale and zeroPoint can be diffent from the input0 scale and zeroPoint.
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
+     *      the scales and zeroPoint can be different from input0 scale and zeroPoint.
      *
      * Available since API level 29.
      */
@@ -3856,14 +4216,23 @@ typedef enum {
     /**
      * Quantizes the input tensor.
      *
-     * The formula is:
+     * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} output tensor is:
      *
      *     output = max(0, min(255, round(input / scale) + zeroPoint)
      *
-     * Supported tensor {@link OperandCode}:
+     * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} output
+     * tensor is:
+     *
+     *     output = max(-128, min(127, round(input / scale) + zeroPoint)
+     *
+     * Supported input tensor {@link OperandCode}:
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      *
+     * Supported output tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
+     *
      * Supported tensor rank: from 1
      *
      * Inputs:
@@ -3871,7 +4240,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: The output tensor of same shape as input0, but with
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}.
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or.
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}.
      *
      * Available since API level 29.
      */
@@ -3995,7 +4365,8 @@ typedef enum {
      * * 1: A scalar {@link ANEURALNETWORKS_INT32}, specifying the number of
      *      independent samples to draw for each row slice.
      * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape [2],
-     *      specifying seeds used to initialize the random distribution.
+     *      specifying seeds used to initialize the random distribution. If both
+     *      provided seeds are 0, both will be randomly generated.
      * Outputs:
      * * 0: A 2-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape
      *      [batches, samples], containing the drawn samples.
@@ -4026,6 +4397,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
      *
      * Available since API level 29.
      */
@@ -4053,6 +4426,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
      *
      * Available since API level 29.
      */
@@ -4070,6 +4445,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -4082,7 +4458,10 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -4101,6 +4480,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: up to 4
      *
@@ -4113,7 +4493,10 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -4142,6 +4525,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
      *
      * Available since API level 29.
      */
@@ -4169,6 +4554,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0.
+     *      If all dimensions are reduced and keep_dims is false, the output
+     *      shape is [1].
      *
      * Available since API level 29.
      */
@@ -4188,9 +4575,10 @@ typedef enum {
      * interpolation.
      *
      * Supported tensor {@link OperandCode}:
-     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29)
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
@@ -4229,7 +4617,8 @@ typedef enum {
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0. The output
      *      shape is [num_rois, out_height, out_width, depth].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from the input0 scale and zeroPoint.
      *
      * Available since API level 29.
@@ -4252,6 +4641,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
@@ -4262,7 +4652,8 @@ typedef enum {
      * * 0: A 4-D tensor, specifying the feature map.
      * * 1: A 2-D Tensor of shape [num_rois, 4], specifying the locations of
      *      the regions of interest, each line with format [x1, y1, x2, y2].
-     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM},
+     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM},
      *      with zeroPoint of 0 and scale of 0.125.
      * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape
@@ -4282,7 +4673,8 @@ typedef enum {
      * Outputs:
      * * 0: A tensor of the same {@link OperandCode} as input0. The output
      *      shape is [num_rois, out_height, out_width, depth].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -4319,6 +4711,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -4329,7 +4722,8 @@ typedef enum {
      *      true) or input2 (if false).
      * * 1: An input tensor of the same shape as input0.
      * * 2: An input tensor of the same shape and type as input1.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scales and zeroPoint can be different from input1 scale and zeroPoint.
      *
      * Outputs:
@@ -4337,6 +4731,7 @@ typedef enum {
      *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
+     * Available since API level 29.
      */
     ANEURALNETWORKS_SELECT = 84,
 
@@ -4376,6 +4771,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -4388,7 +4784,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: An n-D tensor of the same type as the input containing the slice.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      its scale and zeroPoint has to be same as the input0 scale and zeroPoint.
      *
      * Available since API level 29.
@@ -4403,6 +4800,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -4415,7 +4813,8 @@ typedef enum {
      *
      * Outputs:
      * * 0 ~ (num_splits - 1): Resulting subtensors.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -4455,6 +4854,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -4465,7 +4865,8 @@ typedef enum {
      *
      * Outputs:
      * * 0: A tiled tensor of the same {@link OperandCode} and rank as `input`.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
@@ -4483,6 +4884,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_INT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: from 1
      *
@@ -4494,7 +4896,8 @@ typedef enum {
      * Outputs:
      * * 0: An n-D tensor of the same type as the input, containing the k
      *      largest elements along each last dimensional slice.
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      * * 1: An n-D tensor of type {@link ANEURALNETWORKS_TENSOR_INT32}
      *      containing the indices of values within the last dimension of input.
@@ -4531,6 +4934,18 @@ typedef enum {
      * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
      * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
      *
+     * Available since API level 30:
+     * * Quantized signed (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to
+     * * * input.scale * filter.scale).
+     *
+     * * Quantized signed with filter symmetric per channel quantization (since API level 30):
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output.
+     * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter.
+     * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0,
+     * * * each value scaling is separate and equal to input.scale * filter.scales[channel]).
+     *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
      * [batch, height, width, channels]. Alternatively, the data layout could
@@ -4540,24 +4955,25 @@ typedef enum {
      *
      * Inputs (explicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
-     *      specifying the input. Since API level 29, zero batches is supported
-     *      for this tensor.
+     *      specifying the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: A 4-D tensor, of shape
      *      [depth_out, filter_height, filter_width, depth_in], specifying the
      *      filter. For tensor of type
      *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel
-     *      dimension (extraParams.channelQuant.channelDim) must be set to 0.
+     *      dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
      *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
-     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the
-     *      same type. For input tensor of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be
-     *      of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and
-     *      bias_scale == input_scale * filter_scale. For filter tensor of
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal
-     *      to bias_scale[i] = input_scale * filter_scale[i].
+     *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the
+     *      same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32},
+     *      with zeroPoint of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
      *      the left, in the ‘width’ dimension.
      * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on
@@ -4578,24 +4994,25 @@ typedef enum {
      *
      * Inputs (implicit padding):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth_in],
-     *      specifying the input. Since API level 29, zero batches is supported
-     *      for this tensor.
+     *      specifying the input.
+     *      Since API level 29, zero batches is supported for this tensor.
      * * 1: A 4-D tensor, of shape
      *      [depth_out, filter_height, filter_width, depth_in], specifying the
      *      filter. For tensor of type
      *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel
-     *      dimension (extraParams.channelQuant.channelDim) must be set to 0.
+     *      dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0.
      * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input
      *      tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or
      *      {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the
-     *      same type. For input tensor of type
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be
-     *      of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and
-     *      bias_scale == input_scale * filter_scale. For filter tensor of
-     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias
-     *      must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of
-     *      0 and bias_scale of 0. The actual scale of each value 'i' is equal
-     *      to bias_scale[i] = input_scale * filter_scale[i].
+     *      same type.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     *      and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED},
+     *      the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32},
+     *      with zeroPoint of 0 and bias_scale == input_scale * filter_scale.
+     *      For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL},
+     *      the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0
+     *      and bias_scale of 0. The actual scale of each value 'i' is equal to
+     *      bias_scale[i] = input_scale * filter_scale[i].
      * * 3: An {@link ANEURALNETWORKS_TENSOR_INT32} tensor, specifying the output
      *      tensor shape.
      * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit
@@ -4614,7 +5031,8 @@ typedef enum {
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, out_height, out_width, depth_out].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint can be different from inputs' scale and zeroPoint.
      *
      * Available since API level 29.
@@ -4727,8 +5145,21 @@ typedef enum {
      *      A 3-D tensor of shape:
      *        If time-major: [max_time, batch_size, output_size]
      *        If batch-major: [batch_size, max_time, output_size]
+     * * 1: A tensor of shape [batch_size, output_size] containing a hidden
+     *      state from the last time step in the sequence. This output is
+     *      optional and can be omitted. If this output is present then
+     *      output #2 must be present as well.
+     *      Available since API level 30.
+     * * 2: A tensor of shape [batch_size, cell_size] containing a cell state
+     *      from the last time step in the sequence. This output is optional
+     *      and can be omitted.
+     *      Available since API level 30.
      *
      * Available since API level 29.
+     *
+     * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI
+     * does not maintain internal states. This operator does not support the usage pattern in which
+     * multiple cells are chained and state tensors are propagated.
      */
     ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM = 92,
 
@@ -4784,8 +5215,16 @@ typedef enum {
      *      it is set to 1, then the output has a shape [maxTime, batchSize,
      *      numUnits], otherwise the output has a shape [batchSize, maxTime,
      *      numUnits].
+     * * 1: A tensor of shape [batchSize, numUnits] containing hidden state
+     *      from the last time step in the sequence. This output is optional
+     *      and can be omitted.
+     *      Available since API level 30.
      *
      * Available since API level 29.
+     *
+     * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI
+     * does not maintain internal states. This operator does not support the usage pattern in which
+     * multiple cells are chained and state tensors are propagated.
      */
     ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_RNN = 93,
 
@@ -4800,6 +5239,7 @@ typedef enum {
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
      * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
      * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30)
      *
      * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout.
      * With the default data layout NHWC, the data is stored in the order of:
@@ -4817,6 +5257,17 @@ typedef enum {
      *      height of the output tensor.
      * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false.
      *      Set to true to specify NCHW data layout for input0 and output0.
+     * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false.  If True, the centers of the 4 corner
+     *      pixels of the input and output tensors are aligned, preserving the
+     *      values at the corner pixels.
+     *      Available since API level 30.
+     * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false. If True, the pixel centers are assumed to
+     *      be at (0.5, 0.5). This is the default behavior of image.resize in
+     *      TF 2.0. If this parameter is True, then align_corners parameter
+     *      must be False.
+     *      Available since API level 30.
      *
      * Inputs (resizing by scale):
      * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying
@@ -4835,16 +5286,377 @@ typedef enum {
      *      {@link ANEURALNETWORKS_FLOAT32} otherwise.
      * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false.
      *      Set to true to specify NCHW data layout for input0 and output0.
+     * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false.  If True, the centers of the 4 corner
+     *      pixels of the input and output tensors are aligned, preserving the
+     *      values at the corner pixels.
+     *      Available since API level 30.
+     * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL}
+     *      scalar, default to false. If True, the pixel centers are assumed to
+     *      be at (0.5, 0.5). This is the default behavior of image.resize in
+     *      TF 2.0. If this parameter is True, then align_corners parameter
+     *      must be False.
+     *      Available since API level 30.
      *
      * Outputs:
      * * 0: The output 4-D tensor, of shape
      *      [batches, new_height, new_width, depth].
-     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor,
+     *      For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and
+     *      {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor,
      *      the scale and zeroPoint must be the same as input0.
      *
      * Available since API level 29.
      */
     ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR = 94,
+
+    // Operations below are available since API level 30.
+
+    /**
+     * Quantized version of {@link ANEURALNETWORKS_LSTM}.
+     *
+     * The input and the output use asymmetric quantized types, while the rest
+     * use symmetric ones.
+     *
+     * Inputs:
+     * * 0: The input to the LSTM cell.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *      Shape: [batchSize, inputSize]
+     * * 1: The input-to-input weights. Optional.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, inputSize]
+     * * 2: The input-to-forget weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, inputSize]
+     * * 3: The input-to-cell weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, inputSize]
+     * * 4: The input-to-output weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, inputSize]
+     * * 5: The recurrent-to-input weights. Optional.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, outputSize]
+     * * 6: The recurrent-to-forget weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, outputSize]
+     * * 7: The recurrent-to-cell weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, outputSize]
+     * * 8: The recurrent-to-output weights.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *      Shape: [numUnits, outputSize]
+     * * 9: The cell-to-input weights (for peephole). Optional.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *      Shape: [numUnits]
+     * * 10: The cell-to-forget weights (for peephole). Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 11: The cell-to-output weights (for peephole). Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 12: The input gate bias. Quantized with scale being the
+     *       product of input and weights scales and zeroPoint equal to 0.
+     *       Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_INT32}
+     *       Shape: [numUnits]
+     * * 13: The forget gate bias. Quantized with scale being the
+     *       product of input and weights scales and zeroPoint equal to 0.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_INT32}
+     *       Shape: [numUnits]
+     * * 14: The cell bias. Quantized with scale being the
+     *       product of input and weights scales and zeroPoint equal to 0.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_INT32}
+     *       Shape: [numUnits]
+     * * 15: The output gate bias. Quantized with scale being the
+     *       product of input and weights scales and zeroPoint equal to 0.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_INT32}
+     *       Shape: [numUnits]
+     * * 16: The projection weights. Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     *       Shape: [outputSize, numUnits]
+     * * 17: The projection bias. Quantized with scale being the
+     *       product of input and weights scales and zeroPoint equal to 0.
+     *       Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_INT32}
+     *       Shape: [outputSize]
+     * * 18: The output from the previous time step.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *       Shape: [batchSize, outputSize]
+     * * 19: The cell state from the previous time step.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [batchSize, numUnits]
+     * * 20: The input layer normalization weights. Used to rescale
+     *       normalized inputs to activation at input gate. Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 21: The forget layer normalization weights. Used to
+     *       rescale normalized inputs to activation at forget gate. Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 22: The cell layer normalization weights. Used to rescale
+     *       normalized inputs to activation at cell gate. Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 23: The output layer normalization weights. Used to
+     *       rescale normalized inputs to activation at output gate. Optional.
+     *       Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *       Shape: [numUnits]
+     * * 24: The cell clip. If provided the cell state is clipped
+     *       by this value prior to the cell output activation. Optional.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 25: The projection clip. If provided and projection is enabled,
+     *       this is used for clipping the projected values. Optional.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 26: The scale of the intermediate result of matmul,
+     *       i.e. input to layer normalization, at input gate.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 27: The scale of the intermediate result of matmul,
+     *       i.e. input to layer normalization, at forget gate.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 28: The scale of the intermediate result of matmul,
+     *       i.e. input to layer normalization, at cell gate.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 29: The scale of the intermediate result of matmul,
+     *       i.e. input to layer normalization, at output gate.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     * * 30: The zero point of the hidden state, i.e. input to
+     *       projection.
+     *       Type: {@link ANEURALNETWORKS_INT32}.
+     * * 31: The scale of the hidden state, i.e. input to
+     *       projection.
+     *       Type: {@link ANEURALNETWORKS_FLOAT32}.
+     *
+     * Outputs:
+     * * 0: The output state (out).
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *      Shape: [batchSize, outputSize]
+     * * 1: The cell state (out).
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     *      Shape: [batchSize, numUnits]
+     * * 2: The output. This is effectively the same as the current
+     *      "output state (out)" value.
+     *      Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *      Shape: [batchSize, outputSize]
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_QUANTIZED_LSTM = 95,
+
+    /**
+     * Executes one of the two referenced models as determined by a boolean
+     * value.
+     *
+     * The inputs and outputs of the two referenced models must agree with the
+     * signature of this operation. That is, if the operation has (3 + n) inputs
+     * and m outputs, both models must have n inputs and m outputs with the same
+     * types, ranks (if specified), dimensions (if specified), scales,
+     * zeroPoints, and other operand parameters as the corresponding operation
+     * inputs and outputs.
+     *
+     * Inputs:
+     * * 0: A value of type {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1]
+     *      that determines which of the two referenced models to execute.
+     *      The operand must have fully specified dimensions.
+     * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the model to be
+     *      executed if the condition is true.
+     * * 2: A {@link ANEURALNETWORKS_MODEL} reference to the model to be
+     *      executed if the condition is false.
+     * * 3 ~ (n + 2): Inputs to be passed to the model selected for execution.
+     *
+     * Outputs:
+     * * 0 ~ (m - 1): Outputs produced by the selected model.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_IF = 96,
+
+    /**
+     * Executes the body model until the condition model outputs false.
+     *
+     * The inputs to this operation are the condition model, the body model,
+     * and operand values for the first iteration of the loop. The values are
+     * implicitly split into three groups of input-output, state-only, and
+     * input-only values, as described below.
+     *
+     * The outputs of this operation are the final values of input-output
+     * operands.
+     *
+     * Both the condition and body model receive (m + k + n) inputs.
+     * * The first m (m >= 1) inputs are input-output operands. For the first
+     *   iteration, these are initialized from the corresponding inputs of the
+     *   WHILE operation. In subsequent iterations, their values come from the
+     *   corresponding outputs of the body model produced during the previous
+     *   iteration.
+     * * The next k (k >= 0) inputs are state-only operands. They are similar to
+     *   the input-output operands, except that their values are no longer
+     *   available after the loop terminates.
+     * * The last n (n >= 0) inputs are input-only operands. Their values come
+     *   from the corresponding inputs of the WHILE operation.
+     *
+     * The body model produces (m + k) outputs.
+     * * The first m outputs are input-output operands. They become the outputs
+     *   of the WHILE operation when a termination condition is reached.
+     * * The last k outputs are state-only operands. Their values are no longer
+     *   available after the loop terminates.
+     *
+     * The numbers m, k, and n are inferred by the runtime as follows:
+     *     m = (WHILE operation output count)
+     *     k = (body model output count) - m
+     *     n = (body model input count) - m - k
+     *
+     * The pseudo-code below illustrates the flow of a WHILE operation with
+     * inputs condition, body, initial_input_output, initial_state, input_only
+     * (m = 1, k = 1, n = 1):
+     *
+     *     input_output = initial_input_output
+     *     state = initial_state
+     *     while condition(input_output, state, input_only):
+     *         input_output, state = body(input_output, state, input_only)
+     *     return input_output
+     *
+     * To prevent infinite loops, there is an implicit execution timeout
+     * associated with each loop ("loop timeout duration"). See {@link
+     * ANeuralNetworksExecution_setLoopTimeout}.
+     *
+     * Inputs:
+     * * 0: A {@link ANEURALNETWORKS_MODEL} reference to the condition
+     *      model. The model must have (m + k + n) inputs with
+     *      the same types, ranks (if specified), dimensions (if specified),
+     *      scales, zeroPoints, and other operand parameters as the
+     *      corresponding inputs of the WHILE operation and exactly one output
+     *      of {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1].
+     *      The output operand must have fully specified dimensions.
+     * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the body model.
+     *      The model must have (m + k + n) inputs and (m + k) outputs with
+     *      the same types, ranks (if specified), dimensions (if specified),
+     *      scales, zeroPoints, and other operand parameters as the
+     *      corresponding inputs and outputs of the WHILE operation.
+     * * (m inputs): Initial values for input-output operands.
+     * * (k inputs): Initial values for state-only operands.
+     * * (n inputs): Values for input-only operands.
+     *
+     * Outputs:
+     * * 0 ~ (m - 1): Outputs produced by the loop.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_WHILE = 97,
+
+    /**
+     * Computes exponential linear activation on the input tensor element-wise.
+     *
+     * The output is calculated using the following formula:
+     *
+     *     ELU(x) = max(0, x) + min(0, alpha * (exp(x) - 1))
+     *
+     * Supported tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     *
+     * Supported tensor rank: from 1.
+     *
+     * Inputs:
+     * * 0: A tensor, specifying the input. May be zero-sized.
+     * * 1: A scalar, specifying the alpha parameter.
+     *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16},
+     *      the alpha value must be of {@link ANEURALNETWORKS_FLOAT16}.
+     *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32},
+     *      the alpha value must be of {@link ANEURALNETWORKS_FLOAT32}.
+     *
+     * Outputs:
+     * * 0: The output tensor of same shape and type as input0.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_ELU = 98,
+
+    /**
+     * Computes hard-swish activation on the input tensor element-wise.
+     *
+     * Hard swish activation is introduced in
+     * https://arxiv.org/pdf/1905.02244.pdf
+     *
+     * The output is calculated using the following formula:
+     *
+     *     h-swish(x) = x * max(0, min(6, (x + 3))) / 6
+
+     * Supported tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *
+     * Supported tensor rank: from 1.
+     *
+     * Inputs:
+     * * 0: A tensor, specifying the input. May be zero-sized.
+     *
+     * Outputs:
+     * * 0: The output tensor of same shape and type as input0.
+     *      Scale and zero point of this tensor may be different from the input
+     *      tensor's parameters.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_HARD_SWISH = 99,
+
+    /**
+     * Creates a tensor filled with a scalar value.
+     *
+     * Supported output tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32}
+     *
+     * Supported tensor rank: from 1.
+     *
+     * Inputs:
+     * * 0: A 1-D tensor, specifying the desired output tensor shape.
+     * * 1: A scalar, specifying the value to fill the output tensors with.
+     *      For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16},
+     *      the scalar must be of {@link ANEURALNETWORKS_FLOAT16}.
+     *      For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32},
+     *      the scalar must be of {@link ANEURALNETWORKS_FLOAT32}.
+     *      For output tensor of {@link ANEURALNETWORKS_TENSOR_INT32},
+     *      the scalar must be of {@link ANEURALNETWORKS_INT32}.
+     *
+     * Outputs:
+     * * 0: The output tensor.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_FILL = 100,
+
+    /**
+     * Returns the rank of a tensor.
+     *
+     * The rank of a tensor is the number of dimensions in it. Also known as
+     * "order", "degree", "ndims".
+     *
+     * Supported tensor {@link OperandCode}:
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT16}
+     * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+     * * {@link ANEURALNETWORKS_TENSOR_INT32}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_BOOL8}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM}
+     * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}
+     *
+     * Supported tensor rank: from 1.
+     *
+     * Inputs:
+     * * 0: The input tensor.
+     *
+     * Outputs:
+     * * 0: A scalar of {@link ANEURALNETWORKS_INT32}, specifying the rank
+     *      of the input tensor.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_RANK = 101,
 } OperationCode;
 
 /**
@@ -4880,10 +5692,11 @@ typedef enum {
      * the same; for odd number of padding, padding to the ending is bigger
      * than the padding to the beginning by 1.
      *
-     * total_padding is a function of input, stride and filter size.
+     * total_padding is a function of input, stride, dilation and filter size.
      * It could be computed as follows:
-     *    out_size = (input + stride - 1) / stride;
-     *    needed_input = (out_size - 1) * stride + filter_size
+     *    out_size = (input + stride - 1) / stride
+     *    effective_filter_size = (filter_size - 1) * dilation + 1
+     *    needed_input = (out_size - 1) * stride + effective_filter_size
      *    total_padding = max(0, needed_input - input_size)
      *  The computation is the same for the horizontal and vertical directions.
      */
@@ -5004,6 +5817,47 @@ typedef enum {
      * Failure caused by a device not being available.
      */
     ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
+
+    /**
+     * Failure because a deadline could not be met for a task, but future
+     * deadlines may still be met for the same task after a short delay.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT = 10,
+
+    /**
+     * Failure because a deadline could not be met for a task, and future
+     * deadlines will likely also not be met for the same task even after a
+     * short delay.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT = 11,
+
+    /**
+     * Failure because of a resource limitation within the driver, but future
+     * calls for the same task may still succeed after a short delay.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_RESOURCE_EXHAUSTED_TRANSIENT = 12,
+
+    /**
+     * Failure because of a resource limitation within the driver, and future
+     * calls for the same task will likely also fail even after a short
+     * delay.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_RESOURCE_EXHAUSTED_PERSISTENT = 13,
+
+    /**
+     * Failure indicating an object is in a dead state.
+     *
+     * Available since API level 30.
+     */
+    ANEURALNETWORKS_DEAD_OBJECT = 14,
 } ResultCode;
 
 /**
@@ -5024,6 +5878,48 @@ enum { ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 };
 enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 };
 
 /**
+ * Different duration measurements.
+ *
+ * Durations are measured in nanoseconds.
+ *
+ * Available since API level 29.
+ */
+typedef enum {
+    // Execution time on hardware (not driver, which runs on host processor).
+    ANEURALNETWORKS_DURATION_ON_HARDWARE = 0,
+    // Execution time in driver (including time on hardware).  Excludes overhead
+    // such as that of the runtime itself and the IPC needed for the runtime to
+    // communicate with the driver.
+    ANEURALNETWORKS_DURATION_IN_DRIVER = 1,
+    // Execution time on hardware, after all dependencies have been signaled.
+    // If no dependencies specified (for example, if the execution was scheduled other
+    // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the
+    // reported time will be the same as ANEURALNETWORKS_DURATION_ON_HARDWARE.
+    // Available since API level 30.
+    ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE = 2,
+    // Execution time in driver, after all dependencies have been signaled. Excludes
+    // overhead such as that of the runtime itself and the IPC needed for the runtime
+    // to communicate with the driver.
+    // If no dependencies specified (for example, if the execution was scheduled other
+    // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the
+    // reported time will be the same as ANEURALNETWORKS_DURATION_IN_DRIVER.
+    // Available since API level 30.
+    ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER = 3,
+} DurationCode;
+
+/**
+ * Relative execution priority.
+ *
+ * Available since API level 30.
+ */
+typedef enum {
+    ANEURALNETWORKS_PRIORITY_LOW = 90,
+    ANEURALNETWORKS_PRIORITY_MEDIUM = 100,
+    ANEURALNETWORKS_PRIORITY_HIGH = 110,
+    ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM,
+} PriorityCode;
+
+/**
  * ANeuralNetworksMemory is an opaque type that represents memory.
  *
  * This type is used to represent shared memory, memory mapped files,
@@ -5049,7 +5945,21 @@ enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 };
  * of the element type byte size, e.g., a tensor with
  * {@link ANEURALNETWORKS_TENSOR_FLOAT32} type must be aligned on 4-byte boundary.
  *
+ * It is the application's responsibility to ensure that there are no uses of
+ * the memory after calling {@link ANeuralNetworksMemory_free}. This includes
+ * any model which references this memory because of a call to
+ * {@link ANeuralNetworksModel_setOperandValueFromMemory}, any compilation
+ * created using such a model, any execution object or burst object created
+ * using such a compilation, or any execution which references this memory
+ * because of a call to {@link ANeuralNetworksExecution_setInputFromMemory} or
+ * {@link ANeuralNetworksExecution_setOutputFromMemory}.
+ *
  * Available since API level 27.
+ *
+ * Starting at API level 30, the application may request creation of device native memory from
+ * {@link ANeuralNetworksMemoryDesc} to avoid potential memory copying and transformation
+ * overhead between executions. See also {@link ANeuralNetworksMemoryDesc} and
+ * {@link ANeuralNetworksMemory_createFromDesc}.
  */
 typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
 
@@ -5079,9 +5989,10 @@ typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
  * modifies a model at a given time. It is however safe for more than one
  * thread to use the model once {@link ANeuralNetworksModel_finish} has returned.</p>
  *
- * <p>It is also the application's responsibility to ensure that there are no other
- * uses of the model after calling {@link ANeuralNetworksModel_free}.
- * This includes any compilation or execution object created using the model.</p>
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the model after calling {@link ANeuralNetworksModel_free}.
+ * This includes any compilation, execution object or burst object created using
+ * the model.</p>
  *
  * Available since API level 27.
  */
@@ -5119,7 +6030,10 @@ typedef struct ANeuralNetworksModel ANeuralNetworksModel;
  *
  * <p>It is also the application's responsibility to ensure that there are no other
  * uses of the compilation after calling {@link ANeuralNetworksCompilation_free}.
- * This includes any execution object created using the compilation.</p>
+ * This includes any execution object or burst object created using the compilation,
+ * or any memory descriptor with the compilation as part of one of the roles specified by
+ * {@link ANeuralNetworksMemoryDesc_addInputRole} or
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole}.</p>
  *
  * Available since API level 27.
  */
@@ -5139,7 +6053,8 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
  *        {@link ANeuralNetworksExecution_setOutput} or
  *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
  *    <li>Apply the model with one of the following:</li><ul>
- *        <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute},
+ *        <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute}
+ *            or with {@link ANeuralNetworksExecution_startComputeWithDependencies},
  *            waiting for the execution to complete with
  *            {@link ANeuralNetworksEvent_wait}.</li>
  *        <li>Synchronously with {@link ANeuralNetworksExecution_compute}.</li>
@@ -5154,38 +6069,54 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
  * ({@link ANeuralNetworksModel_setOperandValueFromMemory}).</p>
  *
  * <p>An execution cannot be modified once
- * {@link ANeuralNetworksExecution_compute} or
- * {@link ANeuralNetworksExecution_startCompute} has been called on it.</p>
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute},
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} has been called on it.</p>
  *
  * <p>An execution can be applied to a model with
- * {@link ANeuralNetworksExecution_compute} or
- * {@link ANeuralNetworksExecution_startCompute} only once. Create new
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute},
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} only once. Create new
  * executions to do new evaluations of the model.</p>
  *
  * <p>It is the application's responsibility to make sure that only one thread
  * modifies an execution at a given time. It is however safe for more than one
  * thread to use {@link ANeuralNetworksEvent_wait} at the same time.</p>
  *
+ * <p>It is also the application's responsibility to ensure that the execution
+ * either has never been scheduled or has completed (i.e., that
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute}, or
+ * {@link ANeuralNetworksEvent_wait} has returned) before calling
+ * {@link ANeuralNetworksExecution_free}.</p>.
+ *
  * <p>It is also the application's responsibility to ensure that there are no other
  * uses of the execution after calling {@link ANeuralNetworksExecution_free}.</p>
  *
  * <p>Multiple executions can be scheduled and evaluated concurrently, either by
- * means of {@link ANeuralNetworksExecution_compute} (which is synchronous) in
- * different threads or by means of
- * {@link ANeuralNetworksExecution_startCompute} (which is asynchronous). The
- * runtime makes no guarantee on the ordering of completion of executions. If
- * it's important to the application, the application should enforce the
- * ordering by ensuring that one execution completes before the next is
- * scheduled (for example, by scheduling all executions synchronously within a
- * single thread, or by scheduling all executions asynchronously and using
- * {@link ANeuralNetworksEvent_wait} between calls to
- * {@link ANeuralNetworksExecution_startCompute}).</p>
+ * means of {@link ANeuralNetworksExecution_compute} or
+ * {@link ANeuralNetworksExecution_burstCompute} (which are synchronous) in
+ * different threads, or by means of
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} (which are asynchronous).
+ * (Concurrent uses of {@link ANeuralNetworksExecution_burstCompute} must be on
+ * different burst objects.) The runtime makes no guarantee on the ordering of
+ * completion of executions. If it's important to the application, the
+ * application should enforce the ordering by ensuring that one execution
+ * completes before the next is scheduled (for example, by scheduling all
+ * executions synchronously within a single thread, or by scheduling all
+ * executions asynchronously and using {@link ANeuralNetworksEvent_wait} between
+ * calls to {@link ANeuralNetworksExecution_startCompute}); or by using
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} to make the execution wait for a
+ * list of events to be signaled before starting the actual evaluation.</p>
  *
  * Available since API level 27.
  */
 typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
 
-#if __ANDROID_API__ >= __ANDROID_API_Q__
+#if __ANDROID_API__ >= 29
 /**
  * Parameters for ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL operand.
  */
@@ -5230,7 +6161,7 @@ typedef struct ANeuralNetworksSymmPerChannelQuantParams {
  * Available since API level 29.
  */
 typedef struct ANeuralNetworksBurst ANeuralNetworksBurst;
-#endif  //  __ANDROID_API__ >= __ANDROID_API_Q__
+#endif  //  __ANDROID_API__ >= 29
 
 /**
  * ANeuralNetworksOperandType describes the type of an operand.
@@ -5245,7 +6176,9 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst;
  *
  * If a tensor operand's type is not fully specified, the dimensions
  * of the operand are deduced from the operand types and values of the
- * operation for which that operand is an output.
+ * operation for which that operand is an output or from the corresponding
+ * {@link ANEURALNETWORKS_IF} or {@link ANEURALNETWORKS_WHILE} operation input
+ * operand type in the case of referenced model input operands.
  *
  * <p>In the following situations, a tensor operand type must be fully
  * specified:<ul>
@@ -5254,16 +6187,25 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst;
  *         non-nullptr buffer) or
  *         {@link ANeuralNetworksModel_setOperandValueFromMemory}.</li>
  *     <li>The operand is a model input (see
- *         {@link ANeuralNetworksModel_identifyInputsAndOutputs}).  A
- *         fully specified tensor operand type must either be provided
- *         to {@link ANeuralNetworksModel_addOperand}; or it must be
- *         provided to the corresponding
+ *         {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main
+ *         model within a compilation.  A fully specified tensor operand type
+ *         must either be provided to {@link ANeuralNetworksModel_addOperand};
+ *         or it must be provided to the corresponding
  *         {@link ANeuralNetworksExecution_setInput}, or
  *         {@link ANeuralNetworksExecution_setInputFromMemory}.
  *         EXCEPTION: If the input is optional and omitted
  *         (by passing nullptr for buffer to
  *         {@link ANeuralNetworksExecution_setInput}) then it need
- *         not have a fully specified tensor operand type.</li></ul>
+ *         not have a fully specified tensor operand type.</li>
+ *     <li>The operand is a model output (see
+ *         {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main
+ *         model within a compilation and is to be used with {@link
+ *         ANeuralNetworksExecution_startComputeWithDependencies}.
+ *         A fully specified tensor operand type must either be provided
+ *         to {@link ANeuralNetworksModel_addOperand}; or it must be
+ *         provided to the corresponding
+ *         {@link ANeuralNetworksExecution_setOutput}, or
+ *         {@link ANeuralNetworksExecution_setOutputFromMemory}.</li></ul>
  *
  * A tensor operand type of specified rank but some number of
  * unspecified dimensions is represented by setting dimensionCount to
@@ -5296,11 +6238,21 @@ typedef struct ANeuralNetworksOperandType {
     const uint32_t* dimensions;
 
     /**
-     * These two fields are only used for quantized tensors.
-     * They must be zero for all other types.
-     * The dequantized value of each entry is (value - zeroPoint) * scale.
+     * The quantization scale.
+     *
+     * Must be 0 when not applicable to an operand type.
+     *
+     * See {@link OperandCode}.
      */
     float scale;
+
+    /**
+     * The quantization zero point.
+     *
+     * Must be 0 when not applicable to an operand type.
+     *
+     * See {@link OperandCode}.
+     */
     int32_t zeroPoint;
 } ANeuralNetworksOperandType;
 
@@ -5314,7 +6266,7 @@ typedef int32_t ANeuralNetworksOperationType;
  */
 typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
 
-#if __ANDROID_API__ >= __ANDROID_API_Q__
+#if __ANDROID_API__ >= 29
 
 /**
  * ANeuralNetworksDevice is an opaque type that represents a device.
@@ -5326,6 +6278,318 @@ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
  */
 typedef struct ANeuralNetworksDevice ANeuralNetworksDevice;
 
+#endif  // __ANDROID_API__ >= 29
+
+#if __ANDROID_API__ >= 30
+
+/**
+ * ANeuralNetworksMemoryDesc is an opaque type that represents a memory descriptor.
+ *
+ * A memory descriptor describes the properties of a memory object, and is used by
+ * {@link ANeuralNetworksMemory_createFromDesc}.
+ *
+ * To use:
+ *   - Create a new memory descriptor by calling {@link ANeuralNetworksMemoryDesc_create}.
+ *   - Specify all of the intended input and output roles by calling
+ *     {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ *     {@link ANeuralNetworksMemoryDesc_addOutputRole}.
+ *   - Optionally, specify the memory dimensions by calling
+ *     {@link ANeuralNetworksMemoryDesc_setDimensions}.
+ *   - Complete the memory descriptor with {@link ANeuralNetworksMemoryDesc_finish}.
+ *   - Use the memory descriptor as many times as needed with
+ *     {@link ANeuralNetworksMemory_createFromDesc}.
+ *   - Destroy the memory descriptor with {@link ANeuralNetworksMemoryDesc_free}.
+ *
+ * A memory descriptor is completed by calling {@link ANeuralNetworksMemoryDesc_finish}.
+ * A memory descriptor is destroyed by calling {@link ANeuralNetworksMemoryDesc_free}.
+ *
+ * A memory descriptor must not be modified once {@link ANeuralNetworksMemoryDesc_finish}
+ * has been called on it.
+ *
+ * It is the application's responsibility to make sure that only
+ * one thread modifies a memory descriptor at a given time. It is however
+ * safe for more than one thread to use the memory descriptor once
+ * {@link ANeuralNetworksMemoryDesc_finish} has returned.
+ *
+ * It is also the application's responsibility to ensure that there are no other
+ * uses of the memory descriptor after calling {@link ANeuralNetworksMemoryDesc_free}.
+ * It is however safe to continue using a {@link ANeuralNetworksMemory} object created
+ * from the memory descriptor.
+ *
+ * Available since API level 30.
+ */
+typedef struct ANeuralNetworksMemoryDesc ANeuralNetworksMemoryDesc;
+
+/**
+ * Create a {@link ANeuralNetworksMemoryDesc} with no properties.
+ *
+ * This only creates the memory descriptor. Its properties should be set with calls to
+ * {@link ANeuralNetworksMemoryDesc_addInputRole},
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole}, and
+ * {@link ANeuralNetworksMemoryDesc_setDimensions}.
+ *
+ * {@link ANeuralNetworksMemoryDesc_finish} must be called once all properties have been set.
+ *
+ * {@link ANeuralNetworksMemoryDesc_free} must be called once the memory descriptor
+ * is no longer needed.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The {@link ANeuralNetworksMemoryDesc} to be created.
+ *             Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemoryDesc_create(ANeuralNetworksMemoryDesc** desc) __INTRODUCED_IN(30);
+
+/**
+ * Destroy a memory descriptor.
+ *
+ * The memory descriptor need not have been finished by a call to
+ * {@link ANeuralNetworksMemoryDesc_finish}.
+ *
+ * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor to be destroyed. Passing NULL is acceptable and
+ *             results in no operation.
+ */
+void ANeuralNetworksMemoryDesc_free(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30);
+
+/**
+ * Specify that a memory object will be playing the role of an input to an execution created from a
+ * particular compilation.
+ *
+ * The compilation and the input index fully specify an input operand. This function
+ * may be invoked multiple times on the same memory descriptor with different input operands,
+ * and the same input operand may be specified on multiple memory descriptors. However,
+ * specifying the same input operand on the same memory descriptor more than once will
+ * return an error.
+ *
+ * The dimensions of the corresponding model operands of all the roles specified by
+ * {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two
+ * dimensions are incompatible if both ranks are fully specified but have different values, or if
+ * there is at least one axis that is fully specified in both but has different values.
+ *
+ * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on a memory descriptor
+ * before invoking {@link ANeuralNetworksMemoryDesc_finish}.
+ *
+ * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been
+ * called will return an error.
+ *
+ * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor to be modified.
+ * @param compilation The compilation object. It must already have been finished by calling
+ *                    {@link ANeuralNetworksCompilation_finish}, and must outlive the memory
+ *                    descriptor.
+ * @param index The index of the input argument we are referencing from the compilation. It is
+ *              an index into the inputs list passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link ANeuralNetworksModel_addOperand}.
+ * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the
+ *                  memory is to be used in the specified role. This is provided as a hint to
+ *                  optimize the case when different roles prefer different memory locations or data
+ *                  layouts.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemoryDesc_addInputRole(ANeuralNetworksMemoryDesc* desc,
+                                           const ANeuralNetworksCompilation* compilation,
+                                           uint32_t index, float frequency) __INTRODUCED_IN(30);
+
+/**
+ * Specify that a memory object will be playing the role of an output to an execution created from a
+ * particular compilation.
+ *
+ * The compilation and the output index fully specify an output operand. This function
+ * may be invoked multiple times on the same memory descriptor with different output operands,
+ * and the same output operand may be specified on multiple memory descriptors. However,
+ * specifying the same output operand on the same memory descriptor object more than once will
+ * return an error.
+ *
+ * The dimensions of the corresponding model operands of all the roles specified by
+ * {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two
+ * dimensions are incompatible if both ranks are fully specified but have different values, or if
+ * there is at least one axis that is fully specified in both but has different values.
+ *
+ * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on the memory descriptor
+ * before invoking {@link ANeuralNetworksMemoryDesc_finish}.
+ *
+ * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been
+ * called will return an error.
+ *
+ * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor to be modified.
+ * @param compilation The compilation object. It must already have been finished by calling
+ *                    {@link ANeuralNetworksCompilation_finish}, and must outlive the memory
+ *                    descriptor.
+ * @param index The index of the output argument we are referencing from the compilation. It is
+ *              an index into the outputs list passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link ANeuralNetworksModel_addOperand}.
+ * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the
+ *                  memory is to be used in the specified role. This is provided as a hint to
+ *                  optimize the case when multiple roles prefer different memory locations or data
+ *                  layouts.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemoryDesc_addOutputRole(ANeuralNetworksMemoryDesc* desc,
+                                            const ANeuralNetworksCompilation* compilation,
+                                            uint32_t index, float frequency) __INTRODUCED_IN(30);
+
+/**
+ * Set the dimensional information of the memory descriptor.
+ *
+ * The specified dimensions must be compatible with the dimensions of the corresponding model
+ * operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole}. Two dimensions are incompatible if both ranks
+ * are fully specified but have different values, or if there is at least one axis that is fully
+ * specified in both but has different values.
+ *
+ * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been
+ * called will return an error.
+ *
+ * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor to be modified.
+ * @param rank The number of dimensions. Must be 0 for scalars.
+ * @param dimensions An array of dimensions. An entry with the value 0 indicates that the
+ *                   corresponding axis has an unknown size.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemoryDesc_setDimensions(ANeuralNetworksMemoryDesc* desc, uint32_t rank,
+                                            const uint32_t* dimensions) __INTRODUCED_IN(30);
+
+/**
+ * Indicate that we have finished modifying a memory descriptor. Required before calling
+ * {@link ANeuralNetworksMemory_createFromDesc}.
+ *
+ * This function must only be called once for a given memory descriptor.
+ *
+ * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor to be finished.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemoryDesc_finish(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30);
+
+/**
+ * Creates a memory object from a memory descriptor.
+ *
+ * The memory object is created with an uninitialized buffer. A memory object with an uninitialized
+ * buffer may only be used according to the roles specified by {@link
+ * ANeuralNetworksMemoryDesc_addOutputRole}, or as the destination memory in {@link
+ * ANeuralNetworksMemory_copy}. The buffer of a memory object is initialized after the memory object
+ * is used as an output in a successful execution, or used as the destination memory in a successful
+ * {@link ANeuralNetworksMemory_copy}. A memory object with an initialized buffer may be used
+ * according to all roles specified in {@link ANeuralNetworksMemoryDesc}, or as the source or
+ * destination memory in {@link ANeuralNetworksMemory_copy}. The buffer of a memory object will
+ * return to the uninitialized state if the memory object is used as an output in a failed
+ * execution, or used as the destination memory in a failed {@link ANeuralNetworksMemory_copy}.
+ *
+ * The dimensions of the memory descriptor are deduced from the dimensions of the corresponding
+ * model operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and
+ * {@link ANeuralNetworksMemoryDesc_addOutputRole}, as well as the dimensions set by the call to
+ * {@link ANeuralNetworksMemoryDesc_setDimensions}, if any. The memory descriptor may have
+ * unspecified dimensions or rank. In such a case, the same memory object may be used with different
+ * shapes of outputs in different executions. When the memory is used as an input, the input shape
+ * must be the same as the output shape from the last execution using this memory object as an
+ * output, or the last {@link ANeuralNetworkMemory_copy} using this memory object as the destination
+ * memory. Creating a memory object with unspecified dimensions or rank may fail for certain sets of
+ * roles.
+ *
+ * Using the memory in roles or shapes that are not compatible with the rules specified above will
+ * return an error.
+ *
+ * When calling {@link ANeuralNetworksExecution_setInputFromMemory} or
+ * {@link ANeuralNetworksExecution_setOutputFromMemory} with the memory object,
+ * both offset and length must be set to zero and the entire memory region will be
+ * associated with the specified input or output operand.
+ *
+ * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with the memory created from this
+ * function will return an error.
+ *
+ * {@link ANeuralNetworksMemory_free} must be called once the memory is no longer needed.
+ *
+ * Attempting to create memory from an unfinished memory descriptor will return an error.
+ *
+ * The provided {@link ANeuralNetworksMemoryDesc} need not outlive the {@link ANeuralNetworksMemory}
+ * object.
+ *
+ * Available since API level 30.
+ *
+ * @param desc The memory descriptor.
+ * @param memory The memory object to be created.
+ *               Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful; ANEURALNETWORKS_OP_FAILED if the memory is
+ *         created with unspecified dimensions or rank and it is not supported for this set of
+ *         roles.
+ */
+int ANeuralNetworksMemory_createFromDesc(const ANeuralNetworksMemoryDesc* desc,
+                                         ANeuralNetworksMemory** memory) __INTRODUCED_IN(30);
+
+/**
+ * Copies data from one memory object to another.
+ *
+ * If at most one of the src and dst is created from {@link ANeuralNetworksMemory_createFromDesc},
+ * the src and dst must have the same logical size:
+ * - If the memory is created from {@link ANeuralNetworksMemory_createFromFd}, or if it is created
+ *   from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with format of
+ *   AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size of the memory.
+ * - If the memory is created from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with a
+ *   format other than AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size when there is
+ *   no padding and the data is tightly packed. This function may fail if the AHardwareBuffer
+ *   cannot be accessed.
+ * - If the memory is created from {@link ANeuralNetworksMemory_createFromDesc}, the logical size
+ *   equals the size indicated by the {@link OperandCode} multiplied by the number of elements. This
+ *   function will fail if the number of elements is unknown.
+ *
+ * If both src and dst are created from {@link ANeuralNetworksMemory_createFromDesc}, they must have
+ * compatible dimensions. Two dimensions are incompatible if both ranks are fully specified but
+ * have different values, or if there is at least one axis that is fully specified in both but has
+ * different values. The dst may have unspecified dimensions or rank. In such a case, the dimensions
+ * of dst will get updated according to the dimensions of the src.
+ *
+ * In both cases, if the src is created from {@link ANeuralNetworksMemory_createFromDesc}, it must
+ * have been used as an output in a successful execution, or used as the destination memory in a
+ * successful {@link ANeuralNetworksMemory_copy}.
+ *
+ * The src and dst may have different data layout, in which case the data copying is performed
+ * logically with data layout transformation.
+ *
+ * Available since API level 30.
+ *
+ * @param src The source memory object.
+ * @param dst The destination memory object.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksMemory_copy(const ANeuralNetworksMemory* src, const ANeuralNetworksMemory* dst)
+        __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
+#if __ANDROID_API__ >= 29
+
 /**
  * Get the number of available devices.
  *
@@ -5359,7 +6623,8 @@ int ANeuralNetworks_getDevice(uint32_t devIndex, ANeuralNetworksDevice** device)
  * @param device The representation of the specified device.
  * @param name   The returned name of the specified device. The name will be in UTF-8
  *               and will be null-terminated. It will be recognizable as a known device name
- *               rather than a cryptic string. For devices with feature level 29 and above, the
+ *               rather than a cryptic string. For devices with feature level reported by
+ *               {@link ANeuralNetworksDevice_getFeatureLevel} that is 29 and above, the
  *               format of the name is {VENDOR}-{DEVICE}. For devices with feature level 28
  *               or lower, the format of the name is undefined.
  *               The name will remain valid for the duration of the application.
@@ -5439,6 +6704,26 @@ int ANeuralNetworksDevice_getVersion(const ANeuralNetworksDevice* device, const
 int ANeuralNetworksDevice_getFeatureLevel(const ANeuralNetworksDevice* device,
                                           int64_t* featureLevel) __INTRODUCED_IN(29);
 
+#if __ANDROID_API__ >= 30
+
+/**
+ * Wait until the device is in a live state.
+ *
+ * A device may encounter internal errors and temporarily enter a dead state. A
+ * call that uses a device in such a state will return with the error
+ * {@link ANEURALNETWORKS_DEAD_OBJECT}. ANeuralNetworksDevice_wait will block until
+ * the device is in a live state.
+ *
+ * @param device The representation of the specified device.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksDevice_wait(const ANeuralNetworksDevice* device) __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
 /**
  * Get the supported operations for a specified set of devices. If multiple devices
  * are selected, the supported operation list is a union of supported operations of all
@@ -5473,6 +6758,10 @@ int ANeuralNetworksModel_getSupportedOperationsForDevices(
  * ANeuralNetworksCompilation_create}, where the runtime will attempt to recover
  * from such failures.
  *
+ * The model passed to this function is termed the "main model" of the
+ * compilation, to distinguish it from other models referred to by an Operand
+ * of type {@link ANEURALNETWORKS_MODEL} within this compilation.
+ *
  * @param model The {@link ANeuralNetworksModel} to be compiled.
  * @param devices The set of devices. Must not contain duplicates.
  * @param numDevices The number of devices in the set.
@@ -5502,7 +6791,7 @@ int ANeuralNetworksCompilation_createForDevices(ANeuralNetworksModel* model,
  *                 data. It is recommended to use the code cache directory provided
  *                 by the Android runtime. If not using the code cache directory, the
  *                 user should choose a directory local to the application, and is
- *                 responsible to managing the cache entries.
+ *                 responsible for managing the cache entries.
  * @param token The token provided by the user to specify a model must be of length
  *              ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that
  *              the token is unique to a model within the application. The NNAPI
@@ -5525,10 +6814,24 @@ int ANeuralNetworksCompilation_setCaching(ANeuralNetworksCompilation* compilatio
  * execution has completed and the outputs are ready to be consumed.
  * </p>
  *
+ * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution,
+ * and the execution is not able to complete before the timeout duration is
+ * exceeded, then execution may be aborted, in which case
+ * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. If the device has
+ * a feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel}
+ * that is lower than 30, then the timeout duration hint will be ignored.
+ *
+ * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and
+ * the condition model does not output false within the loop timeout duration,
+ * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*}
+ * will be returned.
+ *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
- * See {@link ANeuralNetworksExecution_startCompute} for asynchronous execution.
- * Synchronous execution incurs lower overhead than asynchronous execution.
+ * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution.
+ * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution.
+ * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for
+ * asynchronous execution with dependencies.
  *
  * Available since API level 29.
  *
@@ -5544,9 +6847,10 @@ int ANeuralNetworksExecution_compute(ANeuralNetworksExecution* execution) __INTR
  * Get the dimensional information of the specified output operand of the model of the
  * {@link ANeuralNetworksExecution}.
  *
- * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute},
- * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate
- * the resources used by the execution.
+ * The execution must have completed.  On asynchronous execution initiated by
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function.
  *
  * @param execution The execution to be queried.
  * @param index The index of the output argument we are querying. It is
@@ -5569,9 +6873,10 @@ int ANeuralNetworksExecution_getOutputOperandRank(ANeuralNetworksExecution* exec
  * Get the dimensional information of the specified output operand of the model of the
  * {@link ANeuralNetworksExecution}. The target output operand cannot be a scalar.
  *
- * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute},
- * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate
- * the resources used by the execution.
+ * The execution must have completed.  On asynchronous execution initiated by
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function.
  *
  * @param execution The execution to be queried.
  * @param index The index of the output argument we are querying. It is an index into the lists
@@ -5625,11 +6930,28 @@ void ANeuralNetworksBurst_free(ANeuralNetworksBurst* burst) __INTRODUCED_IN(29);
  * <p>Schedules synchronous evaluation of the execution. Returns once the
  * execution has completed and the outputs are ready to be consumed.</p>
  *
+ * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution,
+ * and the execution is not able to complete before the timeout duration is
+ * exceeded, then execution may be aborted, in which case
+ * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned.
+ *
+ * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and
+ * the condition model does not output false within the loop timeout duration,
+ * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*}
+ * will be returned. If the device has a feature level reported by
+ * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the
+ * timeout duration hint will be ignored.
+ *
  * <p>There must be at most one {@link ANeuralNetworksExecution} processing at
  * any given time for any given burst object. Any
  * {@link ANeuralNetworksExecution} launched before the previous has finished
  * will result in ANEURALNETWORKS_BAD_STATE.</p>
  *
+ * See {@link ANeuralNetworksExecution_compute} for synchronous execution.
+ * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution.
+ * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for
+ * asynchronous execution with dependencies.
+ *
  * Available since API level 29.
  *
  * @param burst The burst object to execute on.
@@ -5656,14 +6978,14 @@ int ANeuralNetworksExecution_burstCompute(ANeuralNetworksExecution* execution,
  * offset and length must be set to zero and the entire memory region will be
  * associated with the specified input or output operand. There is no guarantee
  * that an arbitrary AHardwareBuffer_Format and AHardwareBuffer_UsageFlags combination
- * can be used by arbitrary devices. The execution will fail if selected set of devices
- * cannot consume the buffer.
+ * can be used by arbitrary devices. The execution will fail if the selected set of
+ * devices cannot consume the buffer.
  *
  * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with shared memory
  * backed by an AHardwareBuffer of a format other than AHARDWAREBUFFER_FORMAT_BLOB is
  * disallowed.
  *
- * TODO(miaowang): add documentation about intended usage with introspection API.
+ * The provided AHardwareBuffer must outlive the ANeuralNetworksMemory object.
  *
  * Available since API level 29.
  *
@@ -5686,8 +7008,12 @@ int ANeuralNetworksMemory_createFromAHardwareBuffer(const AHardwareBuffer* ahwb,
  *
  * By default, duration is not measured.
  *
- * The {@link ANeuralNetworksExecution} must have been created with
+ * The {@link ANeuralNetworksExecution} must have been created from an
+ * {@link ANeuralNetworksCompilation} which in turn was created from
  * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1.
+ * If the device has a feature level reported by
+ * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 29, then the
+ * duration will not be measured.
  *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
@@ -5702,41 +7028,32 @@ int ANeuralNetworksExecution_setMeasureTiming(ANeuralNetworksExecution* executio
         __INTRODUCED_IN(29);
 
 /**
- * Different duration measurements.
- *
- * Durations are measured in nanoseconds.
- *
- * Available since API level 29.
- */
-typedef enum {
-    // Execution time on hardware (not driver, which runs on host processor).
-    ANEURALNETWORKS_DURATION_ON_HARDWARE = 0,
-    // Execution time in driver (including time on hardware).  Excludes overhead
-    // such as that of the runtime itself and the IPC needed for the runtime to
-    // communicate with the driver.
-    ANEURALNETWORKS_DURATION_IN_DRIVER = 1,
-} DurationCode;
-
-/**
  * Get the time spent in the specified {@link ANeuralNetworksExecution}, in nanoseconds.
- * The execution must have completed.
  *
- * Available since API level 29.
+ * The execution must have completed.  On asynchronous execution initiated by
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function.
  *
  * @param execution The execution to be queried.
  * @param durationCode The measurement to be queried, specified by {@link DurationCode}.
  * @param duration The returned duration. If no measurement was requested by
- *                 {@link ANeuralNetworksExecution_setMeasureTiming}, or for some other
- *                 reason the duration is not available, UINT64_MAX will be returned.
- *                 A particular device need not support any given measurement.
+ *                 {@link ANeuralNetworksExecution_setMeasureTiming}, if the
+ *                 device is has a feature level reported by
+ *                 {@link ANeuralNetworksDevice_getFeatureLevel} that is lower
+ *                 than 29, or for some other reason the duration is not
+ *                 available, UINT64_MAX will be returned. A particular device
+ *                 need not support any given measurement.
  *
  * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
  */
 int ANeuralNetworksExecution_getDuration(const ANeuralNetworksExecution* execution,
                                          int32_t durationCode, uint64_t* duration)
         __INTRODUCED_IN(29);
 
-#endif  // __ANDROID_API__ >= __ANDROID_API_Q__
+#endif  // __ANDROID_API__ >= 29
 
 #if __ANDROID_API__ >= 27
 
@@ -5776,7 +7093,8 @@ int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd, size_t
  *
  * Available since API level 27.
  *
- * @param memory The memory object to be freed.
+ * @param memory The memory object to be freed. Passing NULL is acceptable and
+ *               results in no operation.
  */
 void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(27);
 
@@ -5784,8 +7102,10 @@ void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(2
  * Create an empty {@link ANeuralNetworksModel}.
  *
  * <p>This only creates the object. Computation is performed once
- * {@link ANeuralNetworksExecution_compute} or
- * {@link ANeuralNetworksExecution_startCompute} is invoked.
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute},
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked.
  *
  * The model should be constructed with calls to
  * {@link ANeuralNetworksModel_addOperation} and
@@ -5826,8 +7146,8 @@ void ANeuralNetworksModel_free(ANeuralNetworksModel* model) __INTRODUCED_IN(27);
  * calling {@link ANeuralNetworksCompilation_create} and
  * {@link ANeuralNetworksCompilation_createForDevices}.
  *
- * An application is responsible to make sure that no other thread uses
- * the model at the same time.
+ * An application must ensure that no other thread uses the model at the same
+ * time.
  *
  * This function must only be called once for a given model.
  *
@@ -5901,11 +7221,13 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model,
  * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}
  * are immediately copied into the model.
  *
- * For values of length greater than {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES},
- * a pointer to the buffer is stored within the model. The application is responsible
- * for not changing the content of this region until all executions using this model
- * have completed. As the data may be copied during processing, modifying the data
- * after this call yields undefined results.
+ * For values of length greater than
+ * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}, a pointer to
+ * the buffer is stored within the model. The application must not change the
+ * content of this region until all executions using this model have
+ * completed. As the data may be copied during processing, modifying the data
+ * after this call yields undefined results. The provided buffer must outlive
+ * this model.
  *
  * For large tensors, using {@link ANeuralNetworksModel_setOperandValueFromMemory}
  * is likely to be more efficient.
@@ -5930,7 +7252,7 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model,
 int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index,
                                          const void* buffer, size_t length) __INTRODUCED_IN(27);
 
-#if __ANDROID_API__ >= __ANDROID_API_Q__
+#if __ANDROID_API__ >= 29
 
 /**
  * Sets an operand's per channel quantization parameters.
@@ -5955,28 +7277,33 @@ int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
         ANeuralNetworksModel* model, int32_t index,
         const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) __INTRODUCED_IN(29);
 
-#endif  // __ANDROID_API__ >= __ANDROID_API_Q__
+#endif  // __ANDROID_API__ >= 29
 
 /**
  * Sets an operand to a value stored in a memory object.
  *
  * The content of the memory is not copied. A reference to that memory is stored
- * inside the model. The application is responsible for not changing the content
- * of the memory region until all executions using this model have completed.
- * As the data may be copied during processing, modifying the data after this call
- * yields undefined results.
+ * inside the model. The application must not change the content of the memory
+ * region until all executions using this model have completed.  As the data may
+ * be copied during processing, modifying the data after this call yields
+ * undefined results.
+ *
+ * <p>The provided memory must outlive this model.</p>
  *
  * To indicate that an optional operand should be considered missing,
  * use {@link ANeuralNetworksModel_setOperandValue} instead, passing nullptr for buffer.
  *
- * Is disallowed to set an operand value with shared memory backed by an AHardwareBuffer
+ * It is disallowed to set an operand value with shared memory backed by an AHardwareBuffer
  * of a format other than AHARDWAREBUFFER_FORMAT_BLOB.
  *
+ * It is disallowed to set an operand value with memory created from
+ * {@link ANeuralNetworksMemory_createFromDesc}.
+ *
  * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has been
  * called will return an error.
  *
  * See {@link ANeuralNetworksModel} for information on multithreaded usage.
- * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on
+ * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on
  * AHardwareBuffer usage.
  *
  * Available since API level 27.
@@ -5996,6 +7323,39 @@ int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model,
                                                    size_t offset, size_t length)
         __INTRODUCED_IN(27);
 
+#if __ANDROID_API__ >= 30
+
+/**
+ * Sets an operand to a value that is a reference to another NNAPI model.
+ *
+ * The referenced model must already have been finished by a call to
+ * {@link ANeuralNetworksModel_finish}.
+ *
+ * The {@link ANeuralNetworksModel_relaxComputationFloat32toFloat16} setting of
+ * referenced models is overridden by that setting of the main model of a
+ * compilation.
+ *
+ * The referenced model must outlive the model referring to it.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param model The model to be modified.
+ * @param index The index of the model operand we're setting.
+ * @param value The model to be referenced.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksModel_setOperandValueFromModel(ANeuralNetworksModel* model, int32_t index,
+                                                  const ANeuralNetworksModel* value)
+        __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
 /**
  * Add an operation to a model.
  *
@@ -6060,6 +7420,9 @@ int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, u
  * must be calculated using at least the range and precision of the IEEE 754
  * 32-bit floating-point format.
  *
+ * The relaxComputationFloat32toFloat16 setting of the main model of
+ * a compilation overrides the values of the referenced models.
+ *
  * @param model The model to be modified.
  * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
  *              calculated with range and/or precision as low as that of the
@@ -6083,7 +7446,11 @@ int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel*
 /**
  * Create a {@link ANeuralNetworksCompilation} to compile the given model.
  *
- * <p>This only creates the object. Compilation is only performed once
+ * The model passed to this function is termed the "main model" of the
+ * compilation, to distinguish it from other models referred to by an Operand
+ * of type {@link ANEURALNETWORKS_MODEL} within this compilation.
+ *
+ * <p>This function only creates the object. Compilation is only performed once
  * {@link ANeuralNetworksCompilation_finish} is invoked.</p>
  *
  * <p>{@link ANeuralNetworksCompilation_finish} should be called once
@@ -6114,7 +7481,7 @@ int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model,
  * Destroy a compilation.
  *
  * The compilation need not have been finished by a call to
- * {@link ANeuralNetworksModel_finish}.
+ * {@link ANeuralNetworksCompilation_finish}.
  *
  * See {@link ANeuralNetworksCompilation} for information on multithreaded usage.
  *
@@ -6128,7 +7495,8 @@ void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) __
 /**
  * Sets the execution preference.
  *
- * <p>Provides guidance to the runtime when trade-offs are possible.</p>
+ * <p>Provides guidance to the runtime when trade-offs are possible. By default the runtime
+ * uses PREFER_SINGLE_FAST_ANSWER</p>
  *
  * See {@link ANeuralNetworksCompilation} for information on multithreaded usage.
  *
@@ -6146,13 +7514,19 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila
 
 /**
  * Indicate that we have finished modifying a compilation. Required before
- * calling {@link ANeuralNetworksExecution_create}.
+ * calling {@link ANeuralNetworksBurst_create} or
+ * {@link ANeuralNetworksExecution_create}.
  *
- * An application is responsible to make sure that no other thread uses
- * the compilation at the same time.
+ * An application must ensure that no other thread uses the compilation at the
+ * same time.
  *
  * This function must only be called once for a given compilation.
  *
+ * If {@link ANeuralNetworksCompilation_setTimeout} was called on this
+ * compilation, and the compilation is not able to be finished before the
+ * timeout duration is exceeded, then compilation may be aborted, in which case
+ * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned.
+ *
  * See {@link ANeuralNetworksCompilation} for information on multithreaded usage.
  *
  * Available since API level 27.
@@ -6163,11 +7537,85 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila
  */
 int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) __INTRODUCED_IN(27);
 
+#if __ANDROID_API__ >= 30
+
+/**
+ * Set the execution priority.
+ *
+ * Execution priorities are relative to other executions created by the same
+ * application (specifically same uid) for the same device. Specifically,
+ * priorities of executions from one application will not affect executions from
+ * another application. Similarly, priorities of executions on one device will
+ * not affect executions on another device.
+ *
+ * Higher priority executions may use more compute resources than lower priority
+ * executions, and may preempt or starve lower priority executions.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded usage.
+ *
+ * Available since API level 30.
+ *
+ * @param compilation The compilation to be modified.
+ * @param priority The relative priority of the execution compared to other
+ *     executions created by the application. Must be one of
+ *     ANEURALNETWORKS_PRIORITY_*.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+int ANeuralNetworksCompilation_setPriority(ANeuralNetworksCompilation* compilation, int priority)
+        __INTRODUCED_IN(30);
+
+/**
+ * Set the maximum expected duration for compiling the model.
+ *
+ * If the device is not able to complete the compilation within the specified
+ * duration, the compilation may be aborted. The timeout duration begins at the
+ * call to {@link ANeuralNetworksCompilation_finish}.
+ *
+ * This timeout duration acts as a hint to drivers, and can be used to both free
+ * up compute resources within the driver and return control back to the
+ * application quicker than is possible without the hint. It enables drivers
+ * that are able to estimate how long a compilation will take to abort the
+ * compilation before it has even started if the driver believes the compilation
+ * cannot be completed within the timeout duration. Similarly, it enables
+ * drivers to abort an ongoing compilation if it is taking too long. However,
+ * this call does not guarantee that the compilation will complete or abort
+ * within the timeout duration.
+ *
+ * By default (i.e., unless ANeuralNetworksCompilation_setTimeout is called),
+ * the timeout duration for compiling the model is considered infinite.
+ *
+ * The {@link ANeuralNetworksCompilation} must have been created with
+ * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1,
+ * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the
+ * device has a feature level reported by
+ * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the
+ * timeout duration hint will be ignored.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded usage.
+ *
+ * @param compilation The compilation to be modified.
+ * @param duration The maximum amount of time in nanoseconds that is expected to
+ *     be spent finishing a compilation. If this duration is exceeded, the
+ *     compilation may be aborted. If set to 0, the timeout duration is
+ *     considered infinite.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksCompilation_setTimeout(ANeuralNetworksCompilation* compilation,
+                                          uint64_t duration) __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
 /**
  * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
  * This only creates the object. Computation is only performed once
- * {@link ANeuralNetworksExecution_compute} or
- * {@link ANeuralNetworksExecution_startCompute} is invoked.
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute},
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked.
  *
  * <p>The provided compilation must outlive the execution.</p>
  *
@@ -6187,12 +7635,16 @@ int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation,
 /**
  * Destroy an execution.
  *
- * <p>If called on an execution for which
- * {@link ANeuralNetworksExecution_startCompute} has been called, the
- * function will return immediately but will mark the execution to be deleted
- * once the computation completes. The related {@link ANeuralNetworksEvent}
- * will be signaled and the {@link ANeuralNetworksEvent_wait} will return
- * ANEURALNETWORKS_ERROR_DELETED.
+ * <p>The execution need not have been scheduled by a call to
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute},
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies}; but if it has been scheduled,
+ * then the application must not call {@link ANeuralNetworksExecution_free}
+ * until the execution has completed (i.e.,
+ * {@link ANeuralNetworksExecution_burstCompute},
+ * {@link ANeuralNetworksExecution_compute}, or
+ * {@link ANeuralNetworksEvent_wait} has returned).
  *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
@@ -6206,7 +7658,10 @@ void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) __INTROD
 /**
  * Associate a user buffer with an input of the model of the
  * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have
- * been scheduled.
+ * been scheduled. Once evaluation of the execution has been scheduled, the
+ * application must not change the content of the buffer until the execution has
+ * completed. Evaluation of the execution will not change the content of the
+ * buffer.
  *
  * <p>The provided buffer must outlive the execution.</p>
  *
@@ -6244,9 +7699,12 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32
                                       size_t length) __INTRODUCED_IN(27);
 
 /**
- * Associate part of a memory object with an input of the model of the
+ * Associate a region of a memory object with an input of the model of the
  * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have
- * been scheduled.
+ * been scheduled. Once evaluation of the execution has been scheduled, the
+ * application must not change the content of the region until the execution has
+ * completed. Evaluation of the execution will not change the content of the
+ * region.
  *
  * <p>The provided memory must outlive the execution.</p>
  *
@@ -6255,8 +7713,10 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32
  * buffer and 0 for length.
  *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on
+ * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on
  * AHardwareBuffer usage.
+ * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects
+ * created from memory descriptors.
  *
  * Available since API level 27.
  *
@@ -6290,7 +7750,9 @@ int ANeuralNetworksExecution_setInputFromMemory(ANeuralNetworksExecution* execut
 /**
  * Associate a user buffer with an output of the model of the
  * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have
- * been scheduled.
+ * been scheduled. Once evaluation of the execution has been scheduled, the
+ * application must not change the content of the buffer until the execution has
+ * completed.
  *
  * If the output is optional, you can indicate that it is omitted by
  * passing nullptr for buffer and 0 for length.
@@ -6333,9 +7795,11 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3
                                        size_t length) __INTRODUCED_IN(27);
 
 /**
- * Associate part of a memory object with an output of the model of the
+ * Associate a region of a memory object with an output of the model of the
  * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have
- * been scheduled.
+ * been scheduled. Once evaluation of the execution has been scheduled, the
+ * application must not change the content of the region until the execution has
+ * completed.
  *
  * If the output is optional, you can indicate that it is omitted by
  * using {@link ANeuralNetworksExecution_setOutput} instead, passing nullptr for
@@ -6344,8 +7808,10 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3
  * <p>The provided memory must outlive the execution.</p>
  *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
- * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on
+ * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on
  * AHardwareBuffer usage.
+ * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects
+ * created from memory descriptors.
  *
  * Available since API level 27.
  *
@@ -6385,8 +7851,8 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu
 /**
  * Schedule asynchronous evaluation of the execution.
  *
- * <p>Schedules asynchronous evaluation of the execution. Once the model has
- * been applied and the outputs are ready to be consumed, the returned event
+ * <p>Schedules asynchronous evaluation of the execution. Once the execution
+ * has completed and the outputs are ready to be consumed, the returned event
  * will be signaled. Use {@link ANeuralNetworksEvent_wait} to wait for that
  * event.
  * </p>
@@ -6394,10 +7860,31 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu
  * ANeuralNetworksEvent_wait must be called to recuperate the resources used
  * by the execution.
  *
+ * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution,
+ * and the execution is not able to complete before the timeout duration is
+ * exceeded, then execution may be aborted, in which case
+ * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned through
+ * {@link ANeuralNetworksExecution_startCompute} or
+ * {@link ANeuralNetworksEvent_wait} on the event object. If the device has a
+ * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that
+ * is lower than 30, then the timeout duration hint will be ignored.
+ *
+ * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and
+ * the condition model does not output false within the loop timeout duration,
+ * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*}
+ * will be returned through {@link ANeuralNetworksEvent_wait} on the event
+ * object.
+ *
+ * If the device can detect before the execution has started that the execution
+ * will not complete within the timeout duration, the device may choose to skip
+ * the execution and instead return {@link ANEURALNETWORKS_MISSED_DEADLINE_*}.
+ *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
  * See {@link ANeuralNetworksExecution_compute} for synchronous execution.
- * Synchronous execution incurs lower overhead than asynchronous execution.
+ * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution.
+ * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for
+ * asynchronous execution with dependencies.
  *
  * Available since API level 27.
  *
@@ -6405,21 +7892,129 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu
  * @param event The event that will be signaled on completion. event is set to
  *              NULL if there's an error.
  *
- * @return ANEURALNETWORKS_NO_ERROR if successful.
+ * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled.
  */
 int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution,
                                           ANeuralNetworksEvent** event) __INTRODUCED_IN(27);
 
+#if __ANDROID_API__ >= 30
+
+/**
+ * Set the maximum expected duration of the specified execution.
+ *
+ * If the device is not able to complete the execution within the specified
+ * duration, the execution may be aborted. The timeout duration begins at a
+ * call to one of:
+ * - {@link ANeuralNetworksExecution_burstCompute}
+ * - {@link ANeuralNetworksExecution_compute}
+ * - {@link ANeuralNetworksExecution_startCompute}
+ * - {@link ANeuralNetworksExecution_startComputeWithDependencies}
+ *
+ * This timeout duration acts as a hint to drivers, and can be used to both free
+ * up compute resources within the driver and return control back to the
+ * application quicker than is possible without the hint. It enables drivers
+ * that are able to estimate how long an execution will take to abort the
+ * execution before it has even started if the driver believes the execution
+ * cannot be completed within the timeout duration. Similarly, it enables
+ * drivers to abort an ongoing execution if it is taking too long. However, this
+ * call does not guarantee that the execution will complete or abort within the
+ * timeout duration.
+ *
+ * By default (i.e., unless ANeuralNetworksExecution_setTimeout is called),
+ * the timeout duration for execution is considered infinite.
+ *
+ * The {@link ANeuralNetworksExecution} must have been created from an
+ * {@link ANeuralNetworksCompilation} which in turn was created from
+ * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1,
+ * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the
+ * device has a feature level reported by
+ * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the
+ * timeout duration hint will be ignored.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param duration The maximum amount of time in nanoseconds that is expected to
+ *     be spent executing a model. If this duration is exceeded, the execution
+ *     may be aborted. If set to 0, the timeout duration is considered infinite.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksExecution_setTimeout(ANeuralNetworksExecution* execution, uint64_t duration)
+        __INTRODUCED_IN(30);
+
+/**
+ * Set the maximum duration of WHILE loops in the specified execution.
+ *
+ * This is a fuzzy per-loop timeout intended to prevent infinite loops.
+ *
+ * If a WHILE loop condition model does not output false within the specified
+ * duration, the execution will be aborted.
+ *
+ * See {@link ANeuralNetworks_getDefaultLoopTimeout} and
+ * {@link ANeuralNetworks_getMaximumLoopTimeout} for the default
+ * and maximum timeout values.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param duration The maximum amount of time in nanoseconds that can be spent
+ *     executing a WHILE loop. If the specified duration value exceeds the value
+ *     produced by {@link ANeuralNetworks_getMaximumLoopTimeout}, it will be
+ *     overridden by that value.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *         ANEURALNETWORKS_BAD_STATE if execution has started.
+ *         ANEURALNETWORKS_UNEXPECTED_NULL if execution is NULL.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksExecution_setLoopTimeout(ANeuralNetworksExecution* execution, uint64_t duration)
+        __INTRODUCED_IN(30);
+
+/**
+ * Get the default timeout value for WHILE loops.
+ *
+ * @return The default timeout value in nanoseconds.
+ *
+ * Available since API level 30.
+ */
+uint64_t ANeuralNetworks_getDefaultLoopTimeout() __INTRODUCED_IN(30);
+
+/**
+ * Get the maximum timeout value for WHILE loops.
+ *
+ * @return The maximum timeout value in nanoseconds.
+ *
+ * Available since API level 30.
+ */
+uint64_t ANeuralNetworks_getMaximumLoopTimeout() __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
 /**
  * Waits until the execution completes.
  *
  * More than one thread can wait on an event. When the execution completes,
  * all threads will be released.
  *
+ * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution
+ * corresponding to this event, and the execution is not able to complete
+ * before the duration is exceeded, the execution may be aborted, in which case
+ * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned here.
+ *
+ * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and
+ * the condition model does not output false within the loop timeout duration,
+ * the execution will be aborted, and {@link ANEURALNETWORKS_MISSED_DEADLINE_*}
+ * will be returned here.
+ *
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
  * Available since API level 27.
  *
+ * @param event The event that will be signaled on completion.
  * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
  *         ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory cannot
  *         be properly mapped.
@@ -6432,13 +8027,140 @@ int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) __INTRODUCED_IN(27);
  * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
  *
  * Available since API level 27.
+ *
+ * @param event The event object to be destroyed. Passing NULL is acceptable and
+ *              results in no operation.
  */
 void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) __INTRODUCED_IN(27);
 
 #endif  // __ANDROID_API__ >= 27
 
+#if __ANDROID_API__ >= 30
+/**
+ * Create a {@link ANeuralNetworksEvent} from a sync_fence file descriptor.
+ *
+ * The newly created ANeuralNetworksEvent does not take ownership of the provided sync_fence_fd,
+ * it will instead dup the provided sync_fence_fd and own the duplicate.
+ *
+ * @param sync_fence_fd The sync_fence file descriptor.
+ * @param event The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksEvent_createFromSyncFenceFd(int sync_fence_fd, ANeuralNetworksEvent** event)
+        __INTRODUCED_IN(30);
+
+/**
+ * Get sync_fence file descriptor from the event.
+ *
+ * If the ANeuralNetworksEvent is not backed by a sync fence, the sync_fence_fd
+ * will be set to -1, and ANEURALNETWORKS_BAD_DATA will be returned.
+ *
+ * See {@link ANeuralNetworksEvent_createFromSyncFenceFd} and
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies} to see how to create
+ * an event backed by a sync fence.
+ *
+ * The user takes ownership of the returned fd, and must close the returned file descriptor when
+ * it is no longer needed.
+ *
+ * @param event An event that is backed by a sync fence.
+ * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will
+ *                      be set to -1 if there is an error.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksEvent_getSyncFenceFd(const ANeuralNetworksEvent* event, int* sync_fence_fd)
+        __INTRODUCED_IN(30);
+
+/**
+ * Schedule asynchronous evaluation of the execution with dependencies.
+ *
+ * The execution will wait for all the depending events to be signaled before
+ * starting the evaluation. Once the execution has completed and the outputs
+ * are ready to be consumed, the returned event will be signaled. Depending on which
+ * devices are handling the execution, the event could be backed by a sync fence.
+ * Use {@link ANeuralNetworksEvent_wait} to wait for that event.
+ *
+ * ANeuralNetworksEvent_wait must be called to recurperate the resources used
+ * by the execution.
+ *
+ * If parts of the execution are scheduled on devices that do not support fenced execution,
+ * the function call may wait for such parts to finish before returning.
+ *
+ * The function will return an error if any of the events in dependencies is already in a bad
+ * state. After the execution is scheduled, if any of the events in dependencies does not complete
+ * normally, the execution will fail, and {@link ANeuralNetworksEvent_wait} on the returned
+ * event will return an error.
+ *
+ * The function will return an error if any of the execution outputs has a tensor operand type
+ * that is not fully specified.
+ *
+ * The function can be passed a timeout duration in nanoseconds. This timeout
+ * duration acts as a hint to drivers in the same way that the timeout durations
+ * in {@link ANeuralNetworksCompilation_setTimeout} and {@link
+ * ANeuralNetworksExecution_setTimeout} act as hints to drivers. The duration
+ * begins when all waitFor sync fences have been signaled, and can be used
+ * together with {@link ANeuralNetworksExecution_setTimeout} which specifies the
+ * maximum timeout duration beginning at the call to
+ * {@link ANeuralNetworksExecution_startComputeWithDependencies}.
+ * If the duration is non-zero, the {@link ANeuralNetworksExecution} must have been created
+ * from an {@link ANeuralNetworksCompilation} which in turn was created from
+ * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1,
+ * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If either
+ * the timeout duration from {@link ANeuralNetworksExecution_setTimeout} or the
+ * timeout duration passed to this call is exceeded, the execution may be
+ * aborted, in which case {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be
+ * returned through {@link ANeuralNetworksExecution_startComputeWithDependencies}
+ * or {@link ANeuralNetworksEvent_wait} on the event object. If the device has a
+ * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that
+ * is lower than 30, then the timeout duration hints will be ignored.
+ *
+ * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and
+ * the condition model does not output false within the loop timeout duration,
+ * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*}
+ * will be returned through {@link ANeuralNetworksEvent_wait} on the event
+ * object.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * See {@link ANeuralNetworksExecution_compute} for synchronous execution.
+ * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution.
+ * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution.
+ *
+ * @param execution The execution to be scheduled and executed.
+ * @param dependencies A set of depending events. The actual evaluation will not start
+ *                     until all the events are signaled.
+ * @param num_dependencies The number of events in the dependencies set.
+ * @param duration The maximum amount of time in nanoseconds that is expected to
+ *                 be spent executing the model after all dependencies are
+ *                 signaled. If set to 0, the timeout duration is considered
+ *                 infinite.
+ * @param event The event that will be signaled on completion. event is set to
+ *              NULL if there's an error.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled.
+ *
+ * Available since API level 30.
+ */
+int ANeuralNetworksExecution_startComputeWithDependencies(
+        ANeuralNetworksExecution* execution, const ANeuralNetworksEvent* const* dependencies,
+        uint32_t num_dependencies, uint64_t duration, ANeuralNetworksEvent** event)
+        __INTRODUCED_IN(30);
+
+#endif  // __ANDROID_API__ >= 30
+
 __END_DECLS
 
-#endif  // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H
+#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H
+
+// For compatibility with android, check __ANDROID__ is defined
+#ifndef __ANDROID__
+#undef __ANDROID_API__
+#undef __INTRODUCED_IN
+#endif // __ANDROID__
 
 /** @} */
diff --git a/runtime/nnapi-header/include/NeuralNetworksExtensions.h b/runtime/nnapi-header/include/NeuralNetworksExtensions.h
index ca2e04567..dd51b0301 100644
--- a/runtime/nnapi-header/include/NeuralNetworksExtensions.h
+++ b/runtime/nnapi-header/include/NeuralNetworksExtensions.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
-#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
+#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
+#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
 
 #include "NeuralNetworks.h"
 
@@ -37,7 +37,7 @@
 
 __BEGIN_DECLS
 
-#if __ANDROID_API__ >= __ANDROID_API_Q__
+#if __ANDROID_API__ >= 29
 
 /**
  * Queries whether an extension is supported by the driver implementation of the specified device.
@@ -110,8 +110,8 @@ int ANeuralNetworksModel_setOperandExtensionData(ANeuralNetworksModel* model, in
                                                  const void* data, size_t length)
         __INTRODUCED_IN(29);
 
-#endif  // __ANDROID_API__ >= __ANDROID_API_Q__
+#endif  // __ANDROID_API__ >= 29
 
 __END_DECLS
 
-#endif  // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
+#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H
diff --git a/runtime/onert/api/CMakeLists.txt b/runtime/onert/api/CMakeLists.txt
index 49a5aa071..9c6dd90cc 100644
--- a/runtime/onert/api/CMakeLists.txt
+++ b/runtime/onert/api/CMakeLists.txt
@@ -9,10 +9,16 @@ add_library(${ONERT_DEV} SHARED ${API_SRC})
 set(NNFW_API_HEADERS include/nnfw.h include/nnfw_experimental.h)
 
 target_link_libraries(${ONERT_DEV} PUBLIC nnfw-nnapi-header)
-target_link_libraries(${ONERT_DEV} PUBLIC onert_core)
+target_link_libraries(${ONERT_DEV} PRIVATE onert_core)
 target_link_libraries(${ONERT_DEV} PRIVATE jsoncpp tflite_loader circle_loader ${LIB_PTHREAD})
 target_link_libraries(${ONERT_DEV} PRIVATE nnfw_common)
 target_link_libraries(${ONERT_DEV} PRIVATE nnfw_coverage)
+# NOTE Below line is added to remove warning for android build
+#      It will be removed after android build uses gold linker
+if (ANDROID)
+  target_link_libraries(${ONERT_DEV} INTERFACE log)
+endif (ANDROID)
+
 target_include_directories(${ONERT_DEV} PUBLIC include)
 set_target_properties(${ONERT_DEV} PROPERTIES PUBLIC_HEADER "${NNFW_API_HEADERS}")
 
diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h
index 42e43760b..8c6ea3994 100644
--- a/runtime/onert/api/include/nnfw_version.h
+++ b/runtime/onert/api/include/nnfw_version.h
@@ -21,6 +21,6 @@
  * NNFW_VERSION is a uint32 value representing nnfw runtime version
  * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch
  */
-#define NNFW_VERSION 0x01000900
+#define NNFW_VERSION 0x01000a00
 
 #endif // __NNFW_VERSION_H__
diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
index 81b40703f..aa066e190 100644
--- a/runtime/onert/api/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/src/nnfw_api_internal.cc
@@ -112,7 +112,16 @@ NNFW_STATUS nnfw_session::load_circle_from_buffer(uint8_t *buffer, size_t size)
   if (size == 0)
     return NNFW_STATUS_ERROR;
 
-  _subgraphs = onert::circle_loader::loadModel(buffer, size);
+  try
+  {
+    _subgraphs = onert::circle_loader::loadModel(buffer, size);
+  }
+  catch (const std::exception &e)
+  {
+    std::cerr << "Error during model loading : " << e.what() << std::endl;
+    return NNFW_STATUS_ERROR;
+  }
+
   _compiler = std::make_unique<onert::compiler::Compiler>(_subgraphs);
 
   _state = State::MODEL_LOADED;
diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc
index 31f1c10eb..b45b91058 100644
--- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc
+++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc
@@ -14,6 +14,11 @@
  * limitations under the License.
  */
 
+#include <AclActivationBuilder.h>
+#include <AclFunction.h>
+#include <Convert.h>
+#include <Swizzle.h>
+
 #include "ConstantInitializer.h"
 
 namespace onert
@@ -96,6 +101,46 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node)
   }
 }
 
+void ConstantInitializer::visit(const ir::operation::Reverse &node)
+{
+  const auto &output_index = node.getOutputs().at(0);
+
+  const auto &input_index = node.getInputs().at(ir::operation::Reverse::Input::INPUT);
+  const auto &input_obj = _operands.at(input_index);
+
+  const auto &axis_index = node.getInputs().at(ir::operation::Reverse::Input::AXIS);
+  const auto &axis_obj = _operands.at(axis_index);
+
+  const auto ifm_rank = input_obj.shape().rank();
+  const auto frontend_layout = this->_current_op_seq_layout;
+
+  auto output_tensor = this->_tensor_reg->getITensor(output_index);
+  const auto backend_layout = output_tensor->layout();
+
+  if (axis_obj.isConstant())
+  {
+    _init_map[axis_index] = [ifm_rank, frontend_layout, backend_layout](const ir::Operand &operand,
+                                                                        backend::ITensor &obj) {
+      assert(operand.data());
+
+      const auto axis_value = *(reinterpret_cast<const int32_t *>(operand.data()->base()));
+      int32_t axis_tmp = axis_value;
+      if (axis_tmp < 0)
+      {
+        axis_tmp = axis_tmp + ifm_rank;
+      }
+
+      auto axis =
+          acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout, backend_layout).value();
+
+      obj.access([&](ITensor &tensor) {
+        int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer());
+        *into = (int32_t)axis;
+      });
+    };
+  }
+}
+
 } // namespace acl_cl
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.h b/runtime/onert/backend/acl_cl/ConstantInitializer.h
index 4f894fd31..9f3acb461 100644
--- a/runtime/onert/backend/acl_cl/ConstantInitializer.h
+++ b/runtime/onert/backend/acl_cl/ConstantInitializer.h
@@ -38,6 +38,7 @@ public:
   void visit(const ir::operation::Gather &) final;
   void visit(const ir::operation::HashtableLookup &) final;
   void visit(const ir::operation::SpaceToBatchND &) final;
+  void visit(const ir::operation::Reverse &) final;
 };
 
 } // namespace acl_cl
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
index 94489253d..cc9afcaeb 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -78,9 +78,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
   const auto block_size_index{
       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
 
   assert(_ctx.at(block_size_index).data());
 
@@ -98,9 +98,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
 
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   const auto act_info = acl_common::asActivationLayerInfo(activation);
 
@@ -164,10 +164,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
                                             ker_width, ker_height);
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
 
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -202,10 +202,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
   const auto multiplier = node.param().multiplier;
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
 
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -240,7 +240,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
     return;
   }
 
-  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
   std::vector<::arm_compute::ICLTensor *> input_tensors;
   for (auto &ifm_ind : input_indexes)
     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
@@ -268,7 +268,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
   const auto activation = node.param().activation;
 
   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
@@ -286,8 +286,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
   const auto keep_dims{node.param().keep_dims};
   const auto reduce_type = node.param().reduce_type;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   // Convert to ACL axes taking into account negative values and possible duplicates.
   const auto &axes = _ctx.at(axes_index);
@@ -320,8 +320,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   // NOTE This operation must not be changed the layout from frontend to backend
   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
@@ -351,8 +351,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
   (void)dims;
   (void)ndim;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                    output_tensor->handle());
   _return_fn = asAclFunction(std::move(fn));
@@ -365,8 +365,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
 
   const auto beta = node.param().beta;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
@@ -382,8 +382,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
 
-  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
+  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = inputData_tensor->layout();
 
@@ -449,8 +449,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
 
-  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
+  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = inputData_tensor->layout();
 
@@ -523,10 +523,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
     strides_set.set(i, strides[i]);
   }
 
+  // Disable applied dim_correction
+  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
+  {
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(inputData_tensor);
+  }
+
   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
       begin_mask, end_mask, shrink_axis_mask);
 
+  // Revert disabling applied dim_correction
+  if (inputData_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(inputData_tensor);
+  }
+
   _return_fn = asAclFunction(std::move(fn));
 }
 
@@ -534,22 +547,47 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
 {
   const auto ofm_idx{node.getOutputs().at(0)};
   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
-  const auto &perm{node.param().perm};
+  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
 
   const auto rank = _ctx.at(ifm_idx).shape().rank();
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = ifm_tensor->layout();
 
-  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
-  // Reversed
-  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
-      rank, pv, frontend_layout, backend_layout);
+  const auto &perms = _ctx.at(perm_idx);
+  std::vector<int32_t> pv;
+  if (perms.shape() == ir::Shape{0})
+  {
+    pv.resize(rank);
+    std::iota(pv.begin(), pv.end(), 0);
+    std::reverse(pv.begin(), pv.end());
+  }
+  else
+  {
+    pv = _ctx.at(perm_idx).asVector<int32_t>();
+  }
 
-  auto fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
-                                                                ofm_tensor->handle(), backend_pv);
+  std::unique_ptr<arm_compute::IFunction> fn;
+  if (rank == 1)
+  {
+    fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
+  }
+  else if (rank == 2)
+  {
+    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
+    fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
+                                                             ofm_tensor->handle());
+  }
+  else
+  {
+    auto backend_pv =
+        acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
+
+    fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
+                                                           ofm_tensor->handle(), backend_pv);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -559,8 +597,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
       node.param().op_type, node.param().alpha, node.param().beta);
@@ -577,9 +615,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   std::unique_ptr<arm_compute::IFunction> fn;
   switch (node.param().op_type)
@@ -626,8 +664,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   std::unique_ptr<arm_compute::IFunction> fn;
   switch (node.param().op_type)
@@ -647,7 +685,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
       {
         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
                                                             output_tensor->handle());
-        ;
+      }
+      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
+      {
+        fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
+                                                                output_tensor->handle());
       }
       else
       {
@@ -719,8 +761,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                    output_tensor->handle());
@@ -735,10 +777,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
-  auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
+  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
   auto epsilon = node.param().epsilon;
   auto activation = node.param().activation;
 
@@ -764,9 +806,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
 
   const auto comparison_type = node.param().comparison_type;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
-  auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
+  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
@@ -775,6 +817,56 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
   _return_fn = asAclFunction(std::move(fn));
 }
 
+void KernelGenerator::visit(const ir::operation::OneHot &node)
+{
+  const auto output_idx{node.getOutputs().at(0)};
+  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
+  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
+  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
+  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
+  const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
+  assert(depth > 0);
+
+  auto output_tensor = _tensor_reg->getAclTensor(output_idx);
+  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
+  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
+
+  const size_t output_rank = _ctx.at(output_idx).shape().rank();
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto backend_layout = output_tensor->layout();
+  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
+  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
+
+  if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
+  {
+    // This means that high dimension's value is 1 and output_tensor is applied dim_correction
+    acl_common::disableDimCorrection(output_tensor);
+  }
+
+  std::unique_ptr<::arm_compute::IFunction> fn;
+  const auto &offvalue = _ctx.at(offvalue_idx);
+  if (offvalue.isConstant())
+  {
+    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
+        indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
+        acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
+  }
+  else
+  {
+    auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
+    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
+        indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
+        output_tensor->handle(), static_cast<uint32_t>(depth), axis);
+  }
+
+  if (output_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(output_tensor);
+  }
+
+  _return_fn = asAclFunction(std::move(fn));
+}
+
 void KernelGenerator::visit(const ir::operation::Pack &node)
 {
   const auto output_index{node.getOutputs().at(0)};
@@ -786,41 +878,39 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
   for (const auto &input_index : node.getInputs())
     input_indexes.emplace_back(input_index);
 
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
   std::vector<arm_compute::ICLTensor *> inputs;
   for (const auto &input_index : input_indexes)
     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
 
   if (axis < 0)
     axis += output_rank;
   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
 
   // Disable applied dim_correction
-  std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes;
   for (const auto &input_index : input_indexes)
   {
-    size_t input_rank = _ctx.at(input_index).shape().rank();
     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
-    orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
-    assert(input_rank == input_tensor->num_dimensions());
-    if (input_rank != input_tensor->info()->num_dimensions())
+    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
     {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
+      // This means that high dimension's value is 1 and input tensor is applied dim_correction
+      acl_common::disableDimCorrection(input_tensor);
     }
   }
 
   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
 
   // Revert disabling applied dim_correction
-  assert(inputs.size() == orig_inputs_acl_tensor_shapes.size());
-  for (size_t i = 0; i < inputs.size(); ++i)
+  for (const auto &input_index : input_indexes)
   {
-    inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i));
+    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
+    if (input_tensor->dimension(0) == 1)
+    {
+      acl_common::enableDimCorrection(input_tensor);
+    }
   }
 
   _return_fn = asAclFunction(std::move(fn));
@@ -833,7 +923,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node)
       acl_common::convertPoolType(node.param().op_type));
 
   const auto ofm_index{node.getOutputs().at(0)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
   const auto activation = node.param().activation;
   _return_fn = std::make_unique<exec::FunctionSequence>(
       asAclFunction(std::move(raw_fn)),
@@ -845,8 +935,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
   const auto ofm_idx{node.getOutputs().at(0)};
   const auto ifm_idx{node.getInputs().at(0)};
   const auto permute_type = node.getPermuteType();
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
   const auto rank = _ctx.at(ofm_idx).shape().rank();
   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
 
@@ -879,11 +969,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
-
   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
@@ -896,11 +985,10 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
-
   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
       ifm_tensor->handle(), ofm_tensor->handle(),
@@ -925,14 +1013,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
 
   const auto activation = node.param().activation;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
 
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
-  auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
-  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
-  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
+  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
+  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
+  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
 
   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
@@ -954,10 +1042,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
-  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
+  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
 
   assert(_ctx.at(block_size_index).data());
   assert(_ctx.at(paddings_index).data());
@@ -976,8 +1064,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
 
   auto block_size = node.param().block_size;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
@@ -991,9 +1079,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
-  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
+  auto values_tensor = _tensor_reg->getAclTensor(values_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
@@ -1020,8 +1108,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
   float bias = 0.0f;                             // Don't offset the reduction.
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                                radius, alpha, beta, bias, false);
@@ -1041,12 +1129,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
 
-  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
-  auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
-  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
+  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
+  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
+  auto values_tensor = _tensor_reg->getAclTensor(values_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
@@ -1061,9 +1149,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
@@ -1096,9 +1184,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
   }
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
 
   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
 
@@ -1116,9 +1204,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
@@ -1140,9 +1228,9 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
 
   const auto k = node.param().k;
 
-  auto values_tensor = _tensor_reg->getAclTensor(outputValues_index).get();
-  auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(inputData_index).get();
+  auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
+  auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
+  auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
       input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
@@ -1162,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
 
   // NOTE The frontend layout and backend layout must be the same for this operation.
   //      If not the same, we have to add a stage(?) to perform permutation of output tensor. It
@@ -1187,29 +1275,29 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
   assert(k == indices_tensor->num_dimensions());
 
   // Disable applied dim_correction
-  const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
   if (n != ifm_tensor->info()->num_dimensions())
   {
     // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-    const auto ifm = _ctx.at(ifm_index);
-    ifm_tensor->info()->set_tensor_shape(
-        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
+    acl_common::disableDimCorrection(ifm_tensor);
   }
-  const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
   if (k != indices_tensor->info()->num_dimensions())
   {
     // This means that high dimension's value is 1 and indices tensor is applied dim_correction
-    const auto indices = _ctx.at(indices_index);
-    indices_tensor->info()->set_tensor_shape(
-        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
+    acl_common::disableDimCorrection(indices_tensor);
   }
 
   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
 
   // Revert disabling applied dim_correction
-  ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
-  indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
+  if (ifm_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(ifm_tensor);
+  }
+  if (indices_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(indices_tensor);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1218,19 +1306,20 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
 
   auto ifm_shape = _ctx.at(ifm_index).shape();
   auto ofm_shape = _ctx.at(ofm_index).shape();
 
   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
   auto frontend_layout = _current_op_seq_layout;
   auto backend_layout = ifm_tensor->layout();
 
-  int axis_value = node.param().axis;
+  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
   if (axis_value < 0)
   {
     axis_value += ifm_rank;
@@ -1239,7 +1328,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
   auto acl_axis =
       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
 
-  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>(
+  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
       ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
       ::arm_compute::ReductionOperation::ARG_IDX_MAX);
 
@@ -1257,8 +1346,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
   auto beta = node.param().beta;
   auto bias = node.param().bias;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
@@ -1277,8 +1366,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
   auto block_size = node.param().block_size;
   assert(block_size > 0);
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
       input_tensor->handle(), output_tensor->handle(), block_size);
@@ -1289,22 +1378,27 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
 void KernelGenerator::visit(const ir::operation::Split &node)
 {
   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
 
   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
+  if (!_ctx.at(axis_index).isConstant())
+  {
+    throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
+  }
 
   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
   std::vector<ir::OperandIndex> output_indexes;
   for (const auto &output : node.getOutputs())
     output_indexes.emplace_back(output);
 
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   std::vector<arm_compute::ICLTensor *> output_tensors;
   for (const auto &ofm_ind : output_indexes)
-    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());
+    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = ifm_tensor->layout();
-  auto axis = node.param().axis;
+  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
   if (axis < 0)
     axis += ifm_rank;
   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
@@ -1315,6 +1409,60 @@ void KernelGenerator::visit(const ir::operation::Split &node)
   _return_fn = asAclFunction(std::move(fn));
 }
 
+void KernelGenerator::visit(const ir::operation::SplitV &node)
+{
+  const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
+  const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
+  const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
+
+  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
+
+  const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
+  std::vector<ir::OperandIndex> output_indexes;
+  for (const auto &output : node.getOutputs())
+    output_indexes.emplace_back(output);
+
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
+
+  std::vector<arm_compute::ICLTensor *> output_tensors;
+  for (const auto &ofm_ind : output_indexes)
+    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
+
+  auto fn = std::make_unique<arm_compute::CLSplitVEx>();
+  const auto &split_dim_op = _ctx.at(split_dim_index);
+  if (split_dim_op.isConstant())
+  {
+    int32_t split_dim = split_dim_op.asScalar<int32_t>();
+    uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
+    const auto frontend_layout = _current_op_seq_layout;
+    const auto backend_layout = ifm_tensor->layout();
+
+    if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
+    {
+      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
+      acl_common::disableDimCorrection(ifm_tensor);
+    }
+
+    split_dim_revised =
+        acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
+            .value();
+    fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
+                  output_tensors, node.param().num_splits);
+
+    if (ifm_tensor->dimension(0) == 1)
+    {
+      acl_common::enableDimCorrection(ifm_tensor);
+    }
+  }
+  else
+  {
+    throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
+  }
+
+  _return_fn = asAclFunction(std::move(fn));
+}
+
 void KernelGenerator::visit(const ir::operation::Unpack &node)
 {
   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
@@ -1326,34 +1474,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
   for (const auto &output_index : node.getOutputs())
     output_indexes.emplace_back(output_index);
 
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   std::vector<arm_compute::ICLTensor *> outputs;
   for (const auto &output_index : output_indexes)
     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
   if (axis < 0)
     axis += input_rank;
   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
 
   // Disable applied dim_correction
-  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
-  for (const auto &output_index : output_indexes)
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
-    size_t output_rank = _ctx.at(output_index).shape().rank();
-    const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
-    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
-    assert(output_rank == output_tensor->num_dimensions());
-    if (output_rank != output_tensor->info()->num_dimensions())
-    {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
-    }
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
 
-  auto fn = acl_common::generateLayer<arm_compute::CLUnstack>(input, outputs, axis);
+  auto fn =
+      acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
+
+  // Revert disabling applied dim_correction
+  if (input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1373,11 +1519,11 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
 
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto input = _tensor_reg->getAclTensor(input_index)->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
 
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
 
   ::arm_compute::PaddingList padding_list;
   padding_list.resize(rank);
@@ -1391,21 +1537,26 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   }
 
   // Disable applied dim_correction
-  size_t input_rank = _ctx.at(input_index).shape().rank();
   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
-  assert(input_rank == input_tensor->num_dimensions());
-  if (input_rank != input_tensor->info()->num_dimensions())
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
-    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-    input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
 
   auto fn =
       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
 
-  // Do not revert disabling applied dim_correction CLPadKernel has cl kernel for 4-dimension
-  // It would produce a mistach of result
+  // NOTE Do not revert disabling applied dim_correction for 4D.
+  // It would produce a mistach of result by incorrect offset_first_element in
+  // ICLKernel::add_tensor_argument<3>().
+  // We have to disable applied dim_correction and not to revert enabling for the kernel that slices
+  // 4D to 3D because slicing arm_compute::Window can causes incorrect offset_first_element if the
+  // used tensor is 4D and the tensor's high dimention is 1
+  if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1415,8 +1566,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1429,8 +1580,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1438,6 +1589,30 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   _return_fn = asAclFunction(std::move(fn));
 }
 
+void KernelGenerator::visit(const ir::operation::Reverse &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
+
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
+
+  // WORKAROUND: acl-cl backend only allow U32 type for axis
+  //             ConstantInitializer will resolve S32 type to U32 type
+  if (_ctx.at(axis_index).isConstant() &&
+      (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
+  {
+    axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
+  }
+
+  auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
+      ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
+
+  _return_fn = asAclFunction(std::move(fn));
+}
+
 } // namespace acl_cl
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.h b/runtime/onert/backend/acl_cl/KernelGenerator.h
index d188d6d83..e8a922677 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.h
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -59,6 +59,7 @@ public:
   void visit(const ir::operation::InstanceNorm &) override;
   void visit(const ir::operation::Comparison &) override;
   void visit(const ir::operation::LSTM &) override;
+  void visit(const ir::operation::OneHot &) override;
   void visit(const ir::operation::Pack &) override;
   void visit(const ir::operation::Pool2D &) override;
   void visit(const ir::operation::Permute &) override;
@@ -79,10 +80,12 @@ public:
   void visit(const ir::operation::LocalResponseNormalization &) override;
   void visit(const ir::operation::DepthToSpace &) override;
   void visit(const ir::operation::Split &) override;
+  void visit(const ir::operation::SplitV &) override;
   void visit(const ir::operation::Unpack &) override;
   void visit(const ir::operation::Pad &) override;
   void visit(const ir::operation::ConvertFp32ToFp16 &) override;
   void visit(const ir::operation::ConvertFp16ToFp32 &) override;
+  void visit(const ir::operation::Reverse &) override;
 
 private:
   const ir::Operands &_ctx;
diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h
index 372ce689e..257bbd3b4 100644
--- a/runtime/onert/backend/acl_common/AclKernelGen.h
+++ b/runtime/onert/backend/acl_common/AclKernelGen.h
@@ -30,6 +30,20 @@ namespace backend
 namespace acl_common
 {
 
+void enableDimCorrection(IACLTensor *tensor)
+{
+  size_t input_rank = tensor->num_dimensions();
+  const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape())
+      .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), true);
+}
+
+void disableDimCorrection(IACLTensor *tensor)
+{
+  size_t input_rank = tensor->num_dimensions();
+  const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape())
+      .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), false);
+}
+
 template <typename Layer, typename... Args>
 std::unique_ptr<arm_compute::IFunction> generateLayer(Args &&... args)
 {
@@ -138,30 +152,27 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
   const auto projection_clip = projection_threshold;
   assert(cell_clip >= 0.f && projection_clip >= 0.f);
 
-  auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index).get();
-  auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index).get();
-  auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index).get();
-  auto output_tensor = tensor_reg->getAclTensor(output_index).get();
+  auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index);
+  auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index);
+  auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index);
+  auto output_tensor = tensor_reg->getAclTensor(output_index);
 
-  auto input_tensor = tensor_reg->getAclTensor(input_index).get();
+  auto input_tensor = tensor_reg->getAclTensor(input_index);
 
-  auto input_to_forget_weights_tensor =
-      tensor_reg->getAclTensor(input_to_forget_weights_index).get();
-  auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index).get();
-  auto input_to_output_weights_tensor =
-      tensor_reg->getAclTensor(input_to_output_weights_index).get();
+  auto input_to_forget_weights_tensor = tensor_reg->getAclTensor(input_to_forget_weights_index);
+  auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index);
+  auto input_to_output_weights_tensor = tensor_reg->getAclTensor(input_to_output_weights_index);
   auto recurrent_to_forget_weights_tensor =
-      tensor_reg->getAclTensor(recurrent_to_forget_weights_index).get();
-  auto recurrent_to_cell_weights_tensor =
-      tensor_reg->getAclTensor(recurrent_to_cell_weights_index).get();
+      tensor_reg->getAclTensor(recurrent_to_forget_weights_index);
+  auto recurrent_to_cell_weights_tensor = tensor_reg->getAclTensor(recurrent_to_cell_weights_index);
   auto recurrent_to_output_weights_tensor =
-      tensor_reg->getAclTensor(recurrent_to_output_weights_index).get();
+      tensor_reg->getAclTensor(recurrent_to_output_weights_index);
 
-  auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index).get();
-  auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index).get();
-  auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index).get();
-  auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index).get();
-  auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index).get();
+  auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index);
+  auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index);
+  auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index);
+  auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index);
+  auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index);
 
   auto act_info = asActivationLayerInfo(activation);
 
@@ -169,13 +180,13 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
   if (has_cifg_param)
   {
     auto input_to_input_weights_tensor =
-        tensor_reg->getAclTensor(input_to_input_weights_index).get(); // optional
+        tensor_reg->getAclTensor(input_to_input_weights_index); // optional
     auto recurrent_to_input_weights_tensor =
-        tensor_reg->getAclTensor(recurrent_to_input_weights_index).get(); // optional
+        tensor_reg->getAclTensor(recurrent_to_input_weights_index); // optional
     auto cell_to_input_weights_handle =
-        has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index).get()->handle()
+        has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index)->handle()
                            : nullptr; // optional (non-cifg && peephole)
-    auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index).get(); // optional
+    auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index); // optional
     lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
                                 recurrent_to_input_weights_tensor->handle(),
                                 cell_to_input_weights_handle, input_gate_bias_tensor->handle());
@@ -183,19 +194,18 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
   if (has_peephole_param)
   {
     auto cell_to_forget_weights_tensor =
-        tensor_reg->getAclTensor(cell_to_forget_weights_index).get(); // optional
+        tensor_reg->getAclTensor(cell_to_forget_weights_index); // optional
     auto cell_to_output_weights_tensor =
-        tensor_reg->getAclTensor(cell_to_output_weights_index).get(); // optional
+        tensor_reg->getAclTensor(cell_to_output_weights_index); // optional
     lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
                                     cell_to_output_weights_tensor->handle());
   }
   if (has_projection_param)
   {
-    auto projection_weights_tensor =
-        tensor_reg->getAclTensor(projection_weights_index).get(); // optional
-    auto projection_bias_handle =
-        has_projection_bias ? tensor_reg->getAclTensor(projection_bias_index).get()->handle()
-                            : nullptr; // optional
+    auto projection_weights_tensor = tensor_reg->getAclTensor(projection_weights_index); // optional
+    auto projection_bias_handle = has_projection_bias
+                                      ? tensor_reg->getAclTensor(projection_bias_index)->handle()
+                                      : nullptr; // optional
     lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
   }
 
@@ -260,10 +270,10 @@ kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Ope
     reshape.dim(1) = input_size; /* W */
   }
 
-  auto output_tensor = tensor_reg->getAclTensor(output_index).get();
-  const auto input_tensor = tensor_reg->getAclTensor(input_index).get();
-  const auto weight_tensor = tensor_reg->getAclTensor(weight_index).get();
-  const auto bias_tensor = tensor_reg->getAclTensor(bias_index).get();
+  auto output_tensor = tensor_reg->getAclTensor(output_index);
+  const auto input_tensor = tensor_reg->getAclTensor(input_index);
+  const auto weight_tensor = tensor_reg->getAclTensor(weight_index);
+  const auto bias_tensor = tensor_reg->getAclTensor(bias_index);
   const auto frontend_layout = layout;
   const auto acl_layout = output_tensor->handle()->info()->data_layout();
 
@@ -313,8 +323,8 @@ kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands,
   VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl;
   VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl;
 
-  auto ofm_tensor = tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = tensor_reg->getAclTensor(ifm_index);
 
   ::arm_compute::PoolingLayerInfo info{
       pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(),
diff --git a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
index 83d7ad6fd..beec95718 100644
--- a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
+++ b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
@@ -61,8 +61,14 @@ public:
 
     for (const auto &ind : inputs)
     {
-      // NOTE Not support the case that concat's input is a constant or a input of model
-      if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind))
+      /**
+       * NOTE Not support below cases.
+       * 1. concat's input is a constant.
+       * 2. concat's input is a input of model.
+       * 3. concat's input already becomes a subtensor of another concat.
+       */
+      if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind) ||
+          _parent_map.find(ind) != _parent_map.end())
       {
         return;
       }
diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h
index 91452014b..bb7abc95d 100644
--- a/runtime/onert/backend/acl_common/AclTensorBuilder.h
+++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h
@@ -70,8 +70,6 @@ public:
   void allocate() override;
   void postFunctionPrepare() override;
 
-  std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override;
-
   T_AclTensorManager *acl_tensor_manager(void) { return _tensor_mgr.get(); }
 
   void setUsesCount(const ir::OperandIndex &index, size_t num_uses)
@@ -161,7 +159,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::registerTensorInfo(
   else
   {
     // SubTensors
-
     assert(!info.isConstant() && "Subtensors of constants are not supported yet.");
 
     // Update offset info and emplace
@@ -306,13 +303,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::postFunctionPrepare(voi
 }
 
 template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
-std::unique_ptr<ITensorManager>
-AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::releaseStaticTensorManager(void)
-{
-  return std::move(_tensor_mgr);
-}
-
-template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
 void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
 {
   assert(_tensor_mgr->constTensors().size() == 0);
diff --git a/runtime/onert/backend/acl_common/AclTensorRegistry.h b/runtime/onert/backend/acl_common/AclTensorRegistry.h
index 1ef9f4b35..02d66db99 100644
--- a/runtime/onert/backend/acl_common/AclTensorRegistry.h
+++ b/runtime/onert/backend/acl_common/AclTensorRegistry.h
@@ -36,17 +36,11 @@ template <typename T_AclTensorManager> class AclTensorRegistry : public ITensorR
 public:
   AclTensorRegistry(T_AclTensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {}
 
-  std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
-  {
-    return _tensor_mgr->at(ind);
-  }
+  ITensor *getITensor(const ir::OperandIndex &ind) override { return _tensor_mgr->at(ind).get(); }
 
-  std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
-  {
-    return getITensor(ind);
-  }
+  ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getITensor(ind); }
 
-  auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind); }
+  auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); }
 
 private:
   T_AclTensorManager *_tensor_mgr;
diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc
index 67dcc8192..6ef6a2dc3 100644
--- a/runtime/onert/backend/acl_common/Convert.cc
+++ b/runtime/onert/backend/acl_common/Convert.cc
@@ -112,6 +112,8 @@ namespace acl_common
       return ::arm_compute::DataType::S8;
     case ir::DataType::FLOAT16:
       return ::arm_compute::DataType::F16;
+    case ir::DataType::INT64:
+      return ::arm_compute::DataType::S64;
     default:
       throw std::runtime_error("Not supported, yet");
       break;
@@ -299,6 +301,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type)
       return ir::DataType::QUANT_INT8_SYMM;
     case ::arm_compute::DataType::F16:
       return ir::DataType::FLOAT16;
+    case ::arm_compute::DataType::S64:
+      return ir::DataType::INT64;
     default:
       throw std::runtime_error{"Not supported, yet"};
       break;
@@ -335,6 +339,27 @@ arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType
   }
 }
 
+arm_compute::PixelValue asPixelValue(const ir::Operand &operand)
+{
+  assert(operand.isConstant());
+  assert(operand.shape().num_elements() == 1);
+  switch (operand.typeInfo().type())
+  {
+    case ir::DataType::INT32:
+      return arm_compute::PixelValue(operand.asScalar<int32_t>());
+    case ir::DataType::INT64:
+      return arm_compute::PixelValue(operand.asScalar<int64_t>());
+    case ir::DataType::UINT32:
+      return arm_compute::PixelValue(operand.asScalar<uint64_t>());
+    case ir::DataType::UINT8:
+      return arm_compute::PixelValue(operand.asScalar<uint8_t>());
+    case ir::DataType::FLOAT32:
+      return arm_compute::PixelValue(operand.asScalar<float>());
+    default:
+      throw std::runtime_error("asPixelValue : Not supported datatype yet");
+  }
+}
+
 } // namespace acl_common
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_common/Convert.h b/runtime/onert/backend/acl_common/Convert.h
index 380321c07..0b36df102 100644
--- a/runtime/onert/backend/acl_common/Convert.h
+++ b/runtime/onert/backend/acl_common/Convert.h
@@ -17,6 +17,7 @@
 #ifndef __ONERT_BACKEND_ACL_COMMON_CONVERT_H__
 #define __ONERT_BACKEND_ACL_COMMON_CONVERT_H__
 
+#include <arm_compute/core/PixelValue.h>
 #include <arm_compute/core/TensorInfo.h>
 #include <arm_compute/core/SubTensorInfo.h>
 #include <arm_compute/core/TensorShape.h>
@@ -85,6 +86,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type);
 arm_compute::PoolingType convertPoolType(ir::operation::Pool2D::PoolType pool_type_ir);
 arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType reduce_type_ir);
 
+arm_compute::PixelValue asPixelValue(const ir::Operand &operand);
+
 } // namespace acl_common
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc
index 6d53c1245..598d043e7 100644
--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc
@@ -18,7 +18,6 @@
 
 #include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
-#include <arm_compute/runtime/CPP/functions/CPPOneHotEx.h>
 
 #include <AclActivationBuilder.h>
 #include <AclFunction.h>
@@ -75,15 +74,16 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
 
   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   auto frontend_layout = _current_op_seq_layout;
   auto backend_layout = ifm_tensor->layout();
 
-  int axis_value = node.param().axis;
+  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
   if (axis_value < 0)
   {
     axis_value += ifm_rank;
@@ -106,9 +106,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
   const auto block_size_index{
       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
 
   assert(_ctx.at(block_size_index).data());
 
@@ -126,9 +126,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
 
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   std::unique_ptr<arm_compute::IFunction> fn;
   switch (node.param().arithmetic_type)
@@ -190,10 +190,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
                                             ker_width, ker_height);
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
 
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -214,8 +214,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
   auto block_size = node.param().block_size;
   assert(block_size > 0);
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
       input_tensor->handle(), output_tensor->handle(), block_size);
@@ -245,10 +245,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
   const auto multiplier = node.param().multiplier;
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
 
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -282,7 +282,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
     return;
   }
 
-  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
   std::vector<::arm_compute::ITensor *> input_tensors;
   for (const auto &ifm_ind : input_indexes)
     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
@@ -312,8 +312,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
       node.param().op_type, node.param().alpha, node.param().beta);
@@ -343,9 +343,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   std::unique_ptr<arm_compute::IFunction> fn;
   switch (node.param().op_type)
@@ -390,8 +390,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   std::unique_ptr<arm_compute::IFunction> fn;
   switch (node.param().op_type)
@@ -412,6 +412,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
         fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
                                                             output_tensor->handle());
       }
+      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
+      {
+        fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
+                                                                output_tensor->handle());
+      }
       else
       {
         fn = acl_common::generateLayer<arm_compute::NECast>(
@@ -480,9 +485,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
-  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
+  auto values_tensor = _tensor_reg->getAclTensor(values_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
@@ -493,7 +498,7 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
   const auto activation = node.param().activation;
 
   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
@@ -512,12 +517,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
 
-  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
-  auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
-  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
+  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
+  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
+  auto values_tensor = _tensor_reg->getAclTensor(values_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
@@ -539,9 +544,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
   // Converting in reverse order
   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
   const auto backend_layout = ofm_tensor->layout();
   UNUSED_RELEASE(backend_layout);
 
@@ -567,24 +572,26 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
   if (n != ifm_tensor->info()->num_dimensions())
   {
     // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-    const auto ifm = _ctx.at(ifm_index);
-    ifm_tensor->info()->set_tensor_shape(
-        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
+    acl_common::disableDimCorrection(ifm_tensor);
   }
   if (k != indices_tensor->info()->num_dimensions())
   {
     // This means that high dimension's value is 1 and indices tensor is applied dim_correction
-    const auto indices = _ctx.at(indices_index);
-    indices_tensor->info()->set_tensor_shape(
-        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
+    acl_common::disableDimCorrection(indices_tensor);
   }
 
   auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
 
-  // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would
-  // use arm_compute::TensorInfo::offset_element_in_bytes()
-  // It would create an error when the kernel accesses high dimension that its value is 1
+  // Revert disabling applied dim_correction
+  if (ifm_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(ifm_tensor);
+  }
+  if (indices_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(indices_tensor);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -596,10 +603,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
-  auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
+  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
   auto epsilon = node.param().epsilon;
   auto activation = node.param().activation;
 
@@ -630,8 +637,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
   float bias = 0.0f;                             // Don't offset the reduction.
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                                radius, alpha, beta, bias, false);
@@ -653,8 +660,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
   auto beta = node.param().beta;
   auto bias = node.param().bias;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
@@ -682,13 +689,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
   for (const auto &input_index : node.getInputs())
     input_indexes.emplace_back(input_index);
 
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
   std::vector<arm_compute::ITensor *> inputs;
   for (const auto &input_index : input_indexes)
     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
 
   if (axis < 0)
     axis += output_rank;
@@ -697,22 +704,25 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
   // Disable applied dim_correction
   for (const auto &input_index : input_indexes)
   {
-    size_t input_rank = _ctx.at(input_index).shape().rank();
     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
-    assert(input_rank == input_tensor->num_dimensions());
-    if (input_rank != input_tensor->info()->num_dimensions())
+    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
     {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
+      // This means that high dimension's value is 1 and input tensor is applied dim_correction
+      acl_common::disableDimCorrection(input_tensor);
     }
   }
 
   auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
 
-  // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would
-  // use arm_compute::TensorInfo::offset_element_in_bytes()
-  // It would create an error when the kernel accesses high dimension that its value is 1
+  // Revert disabling applied dim_correction
+  for (const auto &input_index : input_indexes)
+  {
+    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
+    if (input_tensor->dimension(0) == 1)
+    {
+      acl_common::enableDimCorrection(input_tensor);
+    }
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -727,8 +737,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   auto rank = _ctx.at(input_index).shape().rank();
   auto pad_base = _ctx.at(pad_index).data()->base();
 
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto input = _tensor_reg->getAclTensor(input_index)->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
 
   ::arm_compute::PaddingList padding_list;
   padding_list.resize(rank);
@@ -737,7 +747,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
 
     const auto frontend_layout = _current_op_seq_layout;
-    const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+    const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
     const auto axis =
         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
@@ -764,7 +774,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node)
       acl_common::convertPoolType(node.param().op_type));
 
   const auto ofm_index{node.getOutputs().at(0)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
   const auto activation = node.param().activation;
   _return_fn = std::make_unique<exec::FunctionSequence>(
       asAclFunction(std::move(raw_fn)),
@@ -776,8 +786,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
   const auto ofm_idx{node.getOutputs().at(0)};
   const auto ifm_idx{node.getInputs().at(0)};
   const auto permute_type = node.getPermuteType();
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
   const auto rank = _ctx.at(ofm_idx).shape().rank();
   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
 
@@ -812,9 +822,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
@@ -828,8 +838,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   // Convert to ACL axes taking into account negative values and possible duplicates.
   const auto &axes = _ctx.at(axes_index);
@@ -866,8 +876,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   // NOTE This operation must not be changed the layout from frontend to backend
   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
@@ -887,11 +897,10 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
-
   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEScale>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
@@ -916,14 +925,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
 
   const auto activation = node.param().activation;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
 
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
-  auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
-  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
-  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
+  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
+  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
+  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
 
   auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
@@ -949,8 +958,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
   (void)dims;
   (void)ndim;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                    output_tensor->handle());
   _return_fn = asAclFunction(std::move(fn));
@@ -962,25 +971,26 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
   const auto beta = node.param().beta;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = input_tensor->layout();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   // Disable applied dim_correction
-  const size_t input_rank = _ctx.at(input_index).shape().rank();
-  if (input_rank != input_tensor->info()->num_dimensions())
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
     // This means that high dimension's value is 1 and input tensor is applied dim_correction
-    const auto input = _ctx.at(input_index);
-    input_tensor->info()->set_tensor_shape(
-        acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
+    acl_common::disableDimCorrection(input_tensor);
   }
 
   auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
       output_tensor->handle(), beta);
 
+  // Revert disabling applied dim_correction
+  if (input_tensor->dimension(0) == 1)
+  {
+    acl_common::disableDimCorrection(input_tensor);
+  }
+
   _return_fn = asAclFunction(std::move(fn));
 }
 
@@ -992,10 +1002,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
-  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
+  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
 
   assert(_ctx.at(block_size_index).data());
   assert(_ctx.at(paddings_index).data());
@@ -1014,8 +1024,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
 
   auto block_size = node.param().block_size;
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
@@ -1027,22 +1037,27 @@ void KernelGenerator::visit(const ir::operation::Split &node)
 {
   // TODO Support this op by SubTensor
   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
 
   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
+  if (!_ctx.at(axis_index).isConstant())
+  {
+    throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
+  }
 
   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
   std::vector<ir::OperandIndex> output_indexes;
   for (const auto &output : node.getOutputs())
     output_indexes.emplace_back(output);
 
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   std::vector<arm_compute::ITensor *> output_tensors;
   for (const auto &ofm_ind : output_indexes)
-    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());
+    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = ifm_tensor->layout();
-  auto axis = node.param().axis;
+  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
   if (axis < 0)
     axis += ifm_rank;
   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
@@ -1059,9 +1074,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
@@ -1076,8 +1091,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
 
-  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
+  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = inputData_tensor->layout();
 
@@ -1141,8 +1156,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
 
-  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
+  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = inputData_tensor->layout();
 
@@ -1211,10 +1226,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
     strides_set.set(i, strides[i]);
   }
 
+  // Disable applied dim_correction
+  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
+  {
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(inputData_tensor);
+  }
+
   auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
       begin_mask, end_mask, shrink_axis_mask);
 
+  // Revert disabling applied dim_correction
+  if (inputData_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(inputData_tensor);
+  }
+
   _return_fn = asAclFunction(std::move(fn));
 }
 
@@ -1244,9 +1272,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
   }
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
 
   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
 
@@ -1261,26 +1289,43 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
 {
   const auto ofm_idx{node.getOutputs().at(0)};
   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
-  const auto &perm{node.param().perm};
+  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
 
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
-  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
+  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
   const auto frontend_layout = _current_op_seq_layout;
   const auto backend_layout = ifm_tensor->layout();
-
   const auto rank = _ctx.at(ifm_idx).shape().rank();
-  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
-  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
-      rank, pv, frontend_layout, backend_layout);
 
-  std::unique_ptr<::arm_compute::IFunction> fn;
-  if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
+  const auto &perms = _ctx.at(perm_idx);
+  std::vector<int32_t> pv;
+  if (perms.shape() == ir::Shape{0})
+  {
+    pv.resize(rank);
+    std::iota(pv.begin(), pv.end(), 0);
+    std::reverse(pv.begin(), pv.end());
+  }
+  else
+  {
+    pv = _ctx.at(perm_idx).asVector<int32_t>();
+  }
+
+  std::unique_ptr<arm_compute::IFunction> fn;
+  if (rank == 1)
   {
+    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
+  }
+  else if (rank == 2)
+  {
+    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
     fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
                                                              ofm_tensor->handle());
   }
   else
   {
+    auto backend_pv =
+        acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
+
     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                            ofm_tensor->handle(), backend_pv);
   }
@@ -1298,34 +1343,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
   for (const auto &output_index : node.getOutputs())
     output_indexes.emplace_back(output_index);
 
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   std::vector<arm_compute::ITensor *> outputs;
   for (const auto &output_index : output_indexes)
     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
 
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
   if (axis < 0)
     axis += input_rank;
   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
 
   // Disable applied dim_correction
-  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
-  for (const auto &output_index : output_indexes)
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
-    size_t output_rank = _ctx.at(output_index).shape().rank();
-    const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
-    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
-    assert(output_rank == output_tensor->num_dimensions());
-    if (output_rank != output_tensor->info()->num_dimensions())
-    {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
-    }
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
 
-  auto fn = acl_common::generateLayer<arm_compute::NEUnstack>(input, outputs, axis);
+  auto fn =
+      acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
+
+  // Revert disabling applied dim_correction
+  if (input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
 
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1335,8 +1378,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                    output_tensor->handle());
@@ -1352,9 +1395,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
 
   const auto comparison_type = node.param().comparison_type;
 
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
-  auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
+  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
 
   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
@@ -1370,15 +1413,20 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
-  const auto axis = node.param().axis;
 
-  auto output_tensor = _tensor_reg->getAclTensor(out_idx).get();
-  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx).get();
-  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx).get();
-  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx).get();
-  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx).get();
+  auto output_tensor = _tensor_reg->getAclTensor(out_idx);
+  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
+  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
+  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
+  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
+
+  const size_t output_rank = _ctx.at(out_idx).shape().rank();
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto backend_layout = output_tensor->layout();
+  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
+  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
 
-  auto fn = acl_common::generateLayer<arm_compute::CPPOneHotEx>(
+  auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
       indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
       offvalue_tensor->handle(), output_tensor->handle(), axis);
   _return_fn = asAclFunction(std::move(fn));
diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h
index 6627412d2..32e249f5a 100644
--- a/runtime/onert/backend/cpu/ExternalContext.h
+++ b/runtime/onert/backend/cpu/ExternalContext.h
@@ -39,16 +39,13 @@ public:
   ExternalContext() : _ruy_context(new ruy::Context)
   {
     setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS));
-#ifdef USE_RUY_GEMV
-    _ruy_context->cache_policy = ruy::kCacheLHSOnNarrowMul;
-#endif
   }
 
   void setMaxNumThreads(int max_num_threads)
   {
     const int target_num_threads =
         max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
-    _ruy_context->max_num_threads = target_num_threads;
+    _ruy_context->set_max_num_threads(target_num_threads);
   }
 
   ruy::Context *ruy_context() const { return _ruy_context.get(); }
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index 74b6f0c6b..5f330ff50 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -232,12 +232,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq)
     dyn_ctx->op_seq = &op_seq;
     dyn_ctx->operations = &_operations_ctx;
     dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
-    dyn_ctx->tensor_registry = _tensor_reg;
     dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager();
 
     _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
   }
-  _return_fn_seq->enableDynamicShapeInferer(true);
 
   _current_op_seq_layout = op_seq.getLayout();
   for (const auto &operation_idx : op_seq.operations())
@@ -272,10 +270,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index);
 
   const auto stride = node.param().stride;
   const auto activation = node.param().activation;
@@ -332,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
   const auto multiplier = node.param().multiplier;
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index);
 
   auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>();
 
@@ -353,11 +351,11 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
   const auto rank = _ctx.at(ofm_index).shape().rank();
   const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout);
 
-  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
 
   std::vector<const IPortableTensor *> input_tensors;
   for (auto &ifm_idx : node.getInputs())
-    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get());
+    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));
 
   auto fn = std::make_unique<ops::ConcatLayer>();
 
@@ -372,9 +370,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
   const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)};
   const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)};
 
-  auto output_alloc = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_alloc = _tensor_reg->getPortableTensor(input_index).get();
-  auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index).get();
+  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
+  auto input_alloc = _tensor_reg->getPortableTensor(input_index);
+  auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index);
 
   auto fn = std::make_unique<ops::BatchToSpaceNDLayer>();
 
@@ -384,7 +382,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
   if (node.getInputs().size() != NNApiInputs)
   {
     const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)};
-    crops_alloc = _tensor_reg->getPortableTensor(crops_data_index).get();
+    crops_alloc = _tensor_reg->getPortableTensor(crops_data_index);
   }
 
   fn->configure(input_alloc, output_alloc, block_size_alloc, crops_alloc);
@@ -398,9 +396,9 @@ void KernelGenerator::visit(const ir::operation::Fill &node)
   const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)};
   const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto value_tensor = _tensor_reg->getPortableTensor(value_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto value_tensor = _tensor_reg->getPortableTensor(value_index);
 
   auto fn = std::make_unique<ops::FillLayer>();
 
@@ -419,11 +417,10 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
   const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
   const auto activation = node.param().activation;
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto weight_tensor = _tensor_reg->getPortableTensor(weight_index).get();
-  auto bias_tensor =
-      bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto weight_tensor = _tensor_reg->getPortableTensor(weight_index);
+  auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index);
 
   auto fn = std::make_unique<ops::FullyConnectedLayer>();
 
@@ -438,8 +435,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   // optional 2nd input
   IPortableTensor *shape_tensor = nullptr;
@@ -447,7 +444,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
   if (node.getInputs().size() == 2)
   {
     const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)};
-    shape_tensor = _tensor_reg->getPortableTensor(shape_index).get();
+    shape_tensor = _tensor_reg->getPortableTensor(shape_index);
   }
 
   auto fn = std::make_unique<ops::ReshapeLayer>();
@@ -461,8 +458,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   // Squeeze can share same kernel with reshape
   auto fn = std::make_unique<ops::ReshapeLayer>();
@@ -479,8 +476,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
 
   const auto beta = node.param().beta;
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::SoftMaxLayer>();
 
@@ -497,9 +494,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
 
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   auto fn = std::make_unique<ops::BinaryArithmeticLayer>();
 
@@ -515,9 +512,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
   const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
   const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   auto comparison_type = node.param().comparison_type;
 
@@ -534,9 +531,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
   const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index);
 
   const auto backend_layout = output_tensor->layout();
   UNUSED_RELEASE(backend_layout);
@@ -575,11 +572,11 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
 
   const auto axis = node.param().axis;
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get();
-  auto depth_tensor = _tensor_reg->getPortableTensor(depth_index).get();
-  auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index).get();
-  auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index);
+  auto depth_tensor = _tensor_reg->getPortableTensor(depth_index);
+  auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index);
+  auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index);
 
   assert(indices_tensor->data_type() == OperandType::INT32);
   assert(axis <= static_cast<int>(indices_tensor->num_dimensions()));
@@ -595,10 +592,10 @@ void KernelGenerator::visit(const ir::operation::Einsum &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
   std::vector<const IPortableTensor *> input_tensors;
   for (auto &ifm_idx : node.getInputs())
-    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get());
+    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));
 
   const auto equation = node.param().equation;
 
@@ -613,7 +610,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node)
 {
   auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq,
                           std::vector<custom::TypeInfo> &types,
-                          std::vector<std::shared_ptr<IPortableTensor>> &tensors) {
+                          std::vector<IPortableTensor *> &tensors) {
     for (auto &idx : opSeq)
     {
       const auto &operand = _ctx.at(idx);
@@ -642,8 +639,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::ElementwiseActivationLayer>();
 
@@ -659,9 +656,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   auto fn = std::make_unique<ops::ElementwiseBinaryLayer>();
 
@@ -676,8 +673,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::ElementwiseUnaryLayer>();
 
@@ -692,9 +689,9 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
   const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index);
 
   auto fn = std::make_unique<ops::ExpandDimsLayer>();
 
@@ -712,11 +709,11 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
 
   assert(-rank <= axis && axis < rank);
 
-  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
 
   std::vector<const IPortableTensor *> input_tensors;
   for (auto &ifm_idx : node.getInputs())
-    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get());
+    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));
 
   auto fn = std::make_unique<ops::PackLayer>();
 
@@ -734,11 +731,11 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
 
   assert(rank == 0 || (-rank <= axis && axis < rank));
 
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   std::vector<IPortableTensor *> output_tensors;
   for (auto &output_idx : node.getOutputs())
-    output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get());
+    output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));
 
   auto fn = std::make_unique<ops::UnpackLayer>();
 
@@ -756,8 +753,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   const auto output_index{node.getOutputs().at(0)};
   assert(_ctx.at(pad_index).data());
 
-  auto input = _tensor_reg->getPortableTensor(input_index).get();
-  auto output = _tensor_reg->getPortableTensor(output_index).get();
+  auto input = _tensor_reg->getPortableTensor(input_index);
+  auto output = _tensor_reg->getPortableTensor(output_index);
   auto pad_rank = _ctx.at(pad_index).shape().dim(0);
   auto pad_base = reinterpret_cast<const int32_t *>(_ctx.at(pad_index).data()->base());
 
@@ -780,13 +777,15 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
 {
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
+  const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto perm_tensor = _tensor_reg->getPortableTensor(perm_index);
 
   auto fn = std::make_unique<ops::TransposeLayer>();
 
-  fn->configure(input_tensor, output_tensor, node.param().perm);
+  fn->configure(input_tensor, perm_tensor, output_tensor);
 
   _return_fn = std::move(fn);
 }
@@ -798,9 +797,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
 
   const auto keep_dims = node.param().keep_dims;
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto axes_tensor = _tensor_reg->getPortableTensor(axes_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto axes_tensor = _tensor_reg->getPortableTensor(axes_index);
 
   if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN)
   {
@@ -828,10 +827,10 @@ void KernelGenerator::visit(const ir::operation::Select &node)
   const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
   const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto condition_tensor = _tensor_reg->getPortableTensor(condition_index).get();
-  auto true_tensor = _tensor_reg->getPortableTensor(true_index).get();
-  auto false_tensor = _tensor_reg->getPortableTensor(false_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto condition_tensor = _tensor_reg->getPortableTensor(condition_index);
+  auto true_tensor = _tensor_reg->getPortableTensor(true_index);
+  auto false_tensor = _tensor_reg->getPortableTensor(false_index);
 
   auto fn = std::make_unique<ops::SelectLayer>();
 
@@ -847,10 +846,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto begins_tensor = _tensor_reg->getPortableTensor(begins_index).get();
-  auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto begins_tensor = _tensor_reg->getPortableTensor(begins_index);
+  auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index);
 
   auto fn = std::make_unique<ops::SliceLayer>();
 
@@ -867,11 +866,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto starts_tensor = _tensor_reg->getPortableTensor(starts_index).get();
-  auto ends_tensor = _tensor_reg->getPortableTensor(ends_index).get();
-  auto strides_tensor = _tensor_reg->getPortableTensor(strides_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto starts_tensor = _tensor_reg->getPortableTensor(starts_index);
+  auto ends_tensor = _tensor_reg->getPortableTensor(ends_index);
+  auto strides_tensor = _tensor_reg->getPortableTensor(strides_index);
 
   auto begin_mask = node.param().begin_mask;
   auto end_mask = node.param().end_mask;
@@ -891,19 +890,18 @@ void KernelGenerator::visit(const ir::operation::Split &node)
   assert(num_splits == static_cast<int>(node.getOutputs().size()));
 
   const auto input_idx{node.getInputs().at(ir::operation::Split::Input::INPUT)};
-  const auto rank = _ctx.at(input_idx).shape().rank();
-  const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout);
-  auto axis_resolved = axis < 0 ? axis + rank : axis;
+  const auto axis_idx{node.getInputs().at(ir::operation::Split::Input::AXIS)};
 
-  auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get();
+  auto in_tensor = _tensor_reg->getPortableTensor(input_idx);
+  auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx);
 
   std::vector<IPortableTensor *> out_tensors;
   for (auto &output_idx : node.getOutputs())
-    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get());
+    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));
 
   auto fn = std::make_unique<ops::SplitLayer>();
 
-  fn->configure(in_tensor, num_splits, axis_resolved, out_tensors);
+  fn->configure(in_tensor, axis_tensor, num_splits, out_tensors);
 
   _return_fn = std::move(fn);
 }
@@ -913,8 +911,8 @@ void KernelGenerator::visit(const ir::operation::Shape &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
 
   auto fn = std::make_unique<ops::ShapeLayer>();
 
@@ -928,18 +926,37 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)};
 
-  auto output_height = node.param().height_out;
-  auto output_width = node.param().width_out;
   auto align_corners = node.param().align_corners;
   auto half_pixel_centers = node.param().half_pixel_centers;
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::ResizeBilinearLayer>();
 
-  fn->configure(input_tensor, output_tensor, output_height, output_width, align_corners,
-                half_pixel_centers);
+  if (node.getInputs().size() == 1)
+  {
+    fn->configure(input_tensor, output_tensor, node.param().height_out, node.param().width_out,
+                  align_corners, half_pixel_centers);
+  }
+  else
+  {
+    assert(node.getInputs().size() == 2);
+    const auto size_index{node.getInputs().at(ir::operation::ResizeBilinear::SIZE)};
+    auto size_tensor = _tensor_reg->getPortableTensor(size_index);
+    if (size_tensor->is_constant())
+    {
+      auto size_vec = _ctx.at(size_index).asVector<int32_t>();
+      const auto height_out = size_vec[0];
+      const auto width_out = size_vec[1];
+      fn->configure(input_tensor, output_tensor, height_out, width_out, align_corners,
+                    half_pixel_centers);
+    }
+    else
+    {
+      fn->configure(input_tensor, output_tensor, size_tensor, align_corners, half_pixel_centers);
+    }
+  }
 
   _return_fn = std::move(fn);
 }
@@ -950,9 +967,9 @@ void KernelGenerator::visit(const ir::operation::Reverse &node)
   const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)};
   const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index);
 
   auto fn = std::make_unique<ops::ReverseLayer>();
 
@@ -965,15 +982,15 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
 {
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ArgMax::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::ArgMax::AXIS)};
 
-  const auto axis = node.param().axis;
-
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index);
 
   auto fn = std::make_unique<ops::ArgMinMaxLayer>();
 
-  fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true);
+  fn->configure(input_tensor, output_tensor, axis_tensor, /* is_arg_max */ true);
 
   _return_fn = std::move(fn);
 }
@@ -992,8 +1009,8 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node)
       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
   const auto activation = node.param().activation;
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
 
   auto fn = std::make_unique<ops::PoolLayer>();
 
@@ -1010,9 +1027,9 @@ void KernelGenerator::visit(const ir::operation::Pow &node)
   const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   auto fn = std::make_unique<ops::PowLayer>();
 
@@ -1026,8 +1043,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(0)};
 
-  auto output_alloc = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_alloc = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
+  auto input_alloc = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::L2NormLayer>();
 
@@ -1043,10 +1060,10 @@ void KernelGenerator::visit(const ir::operation::Range &node)
   const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)};
   const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto start_tensor = _tensor_reg->getPortableTensor(start_index).get();
-  auto limit_tensor = _tensor_reg->getPortableTensor(limit_index).get();
-  auto delta_tensor = _tensor_reg->getPortableTensor(delta_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto start_tensor = _tensor_reg->getPortableTensor(start_index);
+  auto limit_tensor = _tensor_reg->getPortableTensor(limit_index);
+  auto delta_tensor = _tensor_reg->getPortableTensor(delta_index);
 
   auto fn = std::make_unique<ops::RangeLayer>();
 
@@ -1059,8 +1076,8 @@ void KernelGenerator::visit(const ir::operation::Rank &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
 
   auto fn = std::make_unique<ops::RankLayer>();
 
@@ -1075,9 +1092,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
 
-  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   auto fn = std::make_unique<ops::SqDiffLayer>();
 
@@ -1091,9 +1108,9 @@ void KernelGenerator::visit(const ir::operation::Tile &node)
   const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)};
   const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index);
 
   auto fn = std::make_unique<ops::TileLayer>();
 
@@ -1108,10 +1125,10 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
   const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)};
   const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index).get();
-  auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index);
+  auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index);
 
   auto fn = std::make_unique<ops::MatrixBandPartLayer>();
 
@@ -1125,9 +1142,9 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
   const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get();
-  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
+  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);
 
   const auto adj_x = node.param().adj_x;
   const auto adj_y = node.param().adj_y;
@@ -1144,9 +1161,9 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node)
   const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)};
   const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto shape_tensor = _tensor_reg->getPortableTensor(shape_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto shape_tensor = _tensor_reg->getPortableTensor(shape_index);
 
   auto fn = std::make_unique<ops::BroadcastToLayer>();
 
@@ -1159,10 +1176,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
   std::vector<const IPortableTensor *> input_tensors;
   for (auto &ifm_idx : node.getInputs())
-    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get());
+    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));
 
   const auto epsilon = node.param().epsilon;
   const auto is_training = node.param().is_training;
@@ -1183,8 +1200,8 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node)
   const auto beta = node.param().beta;
   const auto axis = node.param().axis;
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
 
   auto fn = std::make_unique<ops::LogSoftMaxLayer>();
 
@@ -1200,10 +1217,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
   const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)};
   const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)};
 
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index).get();
-  auto padding_tensor = _tensor_reg->getPortableTensor(padding_index).get();
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index);
+  auto padding_tensor = _tensor_reg->getPortableTensor(padding_index);
 
   auto fn = std::make_unique<ops::SpaceToBatchNDLayer>();
 
@@ -1218,8 +1235,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
   const auto output_index{node.getOutputs().at(0)};
   auto block_size = node.param().block_size;
 
-  auto input_tensor = _tensor_reg->getPortableTensor(input_index).get();
-  auto output_tensor = _tensor_reg->getPortableTensor(output_index).get();
+  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
+  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
 
   auto fn = std::make_unique<ops::SpaceToDepthLayer>();
 
@@ -1233,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node)
   const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)};
   const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)};
 
-  auto output_alloc = _tensor_reg->getPortableTensor(output_index).get();
-  auto shape_alloc = _tensor_reg->getPortableTensor(shape_index).get();
-  auto seed_alloc = _tensor_reg->getPortableTensor(seed_index).get();
+  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
+  auto shape_alloc = _tensor_reg->getPortableTensor(shape_index);
+  auto seed_alloc = _tensor_reg->getPortableTensor(seed_index);
 
   auto fn = std::make_unique<ops::StatelessRandomUniformLayer>();
 
@@ -1252,13 +1269,13 @@ void KernelGenerator::visit(const ir::operation::SplitV &node)
   const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
   const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
 
-  auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get();
-  auto in_size_splits = _tensor_reg->getPortableTensor(size_splits).get();
-  auto in_split_dim = _tensor_reg->getPortableTensor(split_dim).get();
+  auto in_tensor = _tensor_reg->getPortableTensor(input_idx);
+  auto in_size_splits = _tensor_reg->getPortableTensor(size_splits);
+  auto in_split_dim = _tensor_reg->getPortableTensor(split_dim);
 
   std::vector<IPortableTensor *> out_tensors;
   for (auto &output_idx : node.getOutputs())
-    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get());
+    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));
 
   auto fn = std::make_unique<ops::SplitVLayer>();
 
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc
index 78c98dabf..3edac897c 100644
--- a/runtime/onert/backend/cpu/StaticTensorManager.cc
+++ b/runtime/onert/backend/cpu/StaticTensorManager.cc
@@ -41,7 +41,7 @@ void StaticTensorManager::allocateNonconsts(void)
   for (auto &pair : _tensors->native_tensors())
   {
     const auto &ind = pair.first;
-    auto tensor = pair.second;
+    auto tensor = pair.second.get();
     if (!_as_constants[ind] && !tensor->is_dynamic())
     {
       auto *buffer = _nonconst_mgr->getBuffer(ind);
@@ -62,13 +62,14 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
   assert(!_tensors->getITensor(ind));
   if (as_const)
   {
-    auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout);
-    _tensors->setNativeTensor(ind, tensor);
+    auto tensor = std::make_unique<ExternalTensor>(tensor_info, backend_layout);
+    _tensors->setNativeTensor(ind, std::move(tensor));
   }
   else
   {
-    auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager);
-    _tensors->setNativeTensor(ind, tensor);
+    auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout,
+                                           _dynamic_tensor_manager->dynamic_mem_mgr().get());
+    _tensors->setNativeTensor(ind, std::move(tensor));
   }
   _as_constants[ind] = as_const;
 }
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h b/runtime/onert/backend/cpu/Tensor.cc
index fa2a2d54c..dac8f898b 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h
+++ b/runtime/onert/backend/cpu/Tensor.cc
@@ -14,23 +14,19 @@
  * limitations under the License.
  */
 
-#ifndef __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__
-#define __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__
-
-#include "backend/ITensorRegistry.h"
-#include "UserTensor.h"
+#include "Tensor.h"
 
 namespace onert
 {
 namespace backend
 {
-namespace controlflow
+namespace cpu
 {
 
-using UserTensorRegistry = PortableTensorRegistryTemplate<UserTensor>;
+// `dynamic_cast` not working across library boundaries on NDK
+// With this as a key function, `dynamic_cast` works across dl
+ExternalTensor::~ExternalTensor() {}
 
-} // namespace controlflow
+} // namespace cpu
 } // namespace backend
 } // namespace onert
-
-#endif // __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__
diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h
index 20e60260c..2ad2ad0fb 100644
--- a/runtime/onert/backend/cpu/Tensor.h
+++ b/runtime/onert/backend/cpu/Tensor.h
@@ -41,6 +41,7 @@ class ExternalTensor : public Tensor
 {
 public:
   ExternalTensor() = delete;
+  virtual ~ExternalTensor();
 
 public:
   ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout)
@@ -95,6 +96,21 @@ public:
     }
   }
 
+  /**
+   * @brief Reset reference count to zero and release data
+   */
+  void reset_ref() override
+  {
+    assert(_data != nullptr);
+    assert(_num_references > 0);
+    _num_references = 0;
+
+    _data.reset();
+    _buffer = nullptr;
+  }
+
+  int32_t num_references() override { return _num_references; }
+
 private:
   std::shared_ptr<const ir::Data> _data;
 };
diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc
index 828d52f7c..e6bc55b0b 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.cc
+++ b/runtime/onert/backend/cpu/TensorBuilder.cc
@@ -85,16 +85,6 @@ void TensorBuilder::allocate()
   //      This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
 }
 
-std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
-{
-  return std::move(_static_tensor_mgr);
-}
-
-std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void)
-{
-  return std::move(_dynamic_tensor_mgr);
-}
-
 } // namespace cpu
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h
index b6d5f09cc..448abc229 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.h
+++ b/runtime/onert/backend/cpu/TensorBuilder.h
@@ -58,12 +58,8 @@ public:
   void allocate() override;
   void postFunctionPrepare() override { /* DO NOTHING */}
 
-  std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override;
-
   IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); }
 
-  std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override;
-
 private:
   const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
   std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
index d7b0b2bce..2fd284c91 100644
--- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
@@ -44,24 +44,29 @@ template <typename T> std::function<bool(T, T)> GetComparefunction(bool is_arg_m
 }
 }
 
-void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output, int32_t axis,
-                               bool is_arg_max)
+void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output,
+                               const IPortableTensor *axis, bool is_arg_max)
 {
   _input = input;
   _output = output;
-  if (axis < 0)
-  {
-    axis += input->num_dimensions();
-  }
   _axis = axis;
   _is_arg_max = is_arg_max;
 }
 
 void ArgMinMaxLayer::run()
 {
-#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type)                                 \
-  ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()),     \
-            getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), _axis, \
+  if (_axis->total_size() != sizeof(int32_t))
+  {
+    throw std::runtime_error("ArgMinMax: wrong shape of axis");
+  }
+  auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer());
+  if (axis < 0)
+  {
+    axis += _input->num_dimensions();
+  }
+#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type)                                \
+  ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()),    \
+            getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), axis, \
             GetComparefunction<input_type>(_is_arg_max));
   if (_output->data_type() == ir::DataType::INT32)
   {
diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h
index d7c021624..4c864cb98 100644
--- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h
+++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h
@@ -33,18 +33,18 @@ namespace ops
 class ArgMinMaxLayer : public ::onert::exec::IFunction
 {
 public:
-  ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(-1), _is_arg_max(true) {}
+  ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(nullptr), _is_arg_max(true) {}
 
 public:
-  void configure(const IPortableTensor *indices, IPortableTensor *output, int32_t axis,
-                 bool is_arg_max);
+  void configure(const IPortableTensor *indices, IPortableTensor *output,
+                 const IPortableTensor *axis, bool is_arg_max);
 
   void run() override;
 
 private:
   const IPortableTensor *_input;
   IPortableTensor *_output;
-  int32_t _axis;
+  const IPortableTensor *_axis;
   bool _is_arg_max;
 };
 
diff --git a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
index f50c63375..8e51daad5 100644
--- a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
+++ b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
@@ -34,20 +34,21 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T>
 void eval(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output,
           nnfw::cker::BinaryArithmeticOpParam op_params)
 {
-  const bool need_broadcast =
-      nnfw::cker::ProcessBroadcastShapes(getTensorShape(lhs), getTensorShape(rhs), &op_params);
+  const auto lhs_shape = getTensorShape(lhs);
+  const auto rhs_shape = getTensorShape(rhs);
+  const bool need_broadcast = nnfw::cker::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params);
   if (need_broadcast)
   {
     nnfw::cker::BroadcastBinaryArithmeticOp<arithmetic_type>(
-        op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
-        getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
+        op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape,
+        reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
         reinterpret_cast<T *>(output->buffer()));
     return;
   }
 
   nnfw::cker::BinaryArithmeticOp<arithmetic_type>(
-      op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
-      getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
+      op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape,
+      reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
       reinterpret_cast<T *>(output->buffer()));
 }
 
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index 05da33abf..f873a3430 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -158,16 +158,30 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
   op_params.float_activation_max = output_activation_max;
   op_params.activation = convertActivationType(_activation);
 
-  int w0_size = getTensorShape(_weights).Dims(0);
-  const uint16_t *w1_segments = _weights->w1_segments();
-  const uint16_t *w1_indices = _weights->w1_indices();
+  const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
+  const uint16_t *w1_indices = _weights->sparsity()->w1_indices();
 
-  nnfw::cker::FullyConnectedSparseWeight(
-      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
-      getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments,
-      w1_indices);
+  auto block_size = _weights->sparsity()->block_size();
+  if (block_size.size() == 0)
+  {
+    nnfw::cker::FullyConnectedSparseWeightRandom(
+        op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+        getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
+        getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
+        getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments,
+        w1_indices);
+  }
+  else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
+  {
+    nnfw::cker::FullyConnectedSparseWeight16x1(
+        op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+        getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
+        getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
+        getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments,
+        w1_indices);
+  }
+  else
+    throw std::runtime_error{"FullyConnected: unsupported sparsity"};
 }
 
 void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
@@ -191,7 +205,7 @@ void FullyConnectedLayer::run()
   {
     fullyConnectedHybrid();
   }
-  else if (_weights->is_sparse())
+  else if (_weights->sparsity())
   {
     fullyConnectedSparseWeight();
   }
@@ -239,17 +253,11 @@ void FullyConnectedLayer::prepare()
   const int rows = getTensorShape(_weights).Dims(0);
   if (rows % 4 == 0)
   {
-    const int total_input_size = getTensorShape(_input).FlatSize();
-    const int input_size = getTensorShape(_weights).Dims(1);
-    const int batch_size = total_input_size / input_size;
-    if (batch_size <= 4)
-    {
-      // TODO If it's possible to extract precaching from ruy kernel,
-      // place this instead of below code
+    // TODO If it's possible to extract precaching from ruy kernel,
+    // place this instead of below code
 
-      // buffer will be used by ruy kernel as a cache key
-      _cached_weights = _weights->buffer();
-    }
+    // buffer will be used by ruy kernel as a cache key
+    _cached_weights = _weights->buffer();
   }
 #endif
 }
diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h
index 98385521a..eb24dd43c 100644
--- a/runtime/onert/backend/cpu/ops/OperationUtils.h
+++ b/runtime/onert/backend/cpu/ops/OperationUtils.h
@@ -95,27 +95,18 @@ inline nnfw::cker::Shape getTensorShape(const IPortableTensor *tensor)
   if (tensor == nullptr)
     return nnfw::cker::Shape();
 
+  const ir::Shape &shape = tensor->get_info().shape();
+
   assert(tensor->layout() == ir::Layout::NHWC);
-  constexpr int kMaxSmallSize = 8;
-  int32_t raw_shape_small[kMaxSmallSize];
-  std::vector<int32_t> raw_shape_vec;
-  auto rank = tensor->num_dimensions();
-  int32_t *data = nullptr;
-  if (rank > kMaxSmallSize)
-  {
-    raw_shape_vec.resize(rank);
-    data = raw_shape_vec.data();
-  }
-  else
-  {
-    data = raw_shape_small;
-  }
 
-  for (uint32_t i = 0; i < rank; ++i)
+  auto rank = shape.rank();
+  nnfw::cker::Shape ret(rank);
+  auto data = ret.DimsData();
+  for (int i = 0; i < rank; ++i)
   {
-    data[i] = tensor->dimension(i);
+    data[i] = shape.dim(i);
   }
-  return nnfw::cker::Shape(rank, data);
+  return ret;
 }
 
 inline nnfw::cker::FusedActivationFunctionType
diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
index bb5f85d60..4a55b2a33 100644
--- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
@@ -18,6 +18,7 @@
 
 #include "OperationUtils.h"
 
+#include "cker/neon/neon_check.h"
 #include <cker/operation/Reduce.h>
 
 namespace onert
@@ -158,7 +159,7 @@ void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output,
 
 ReduceLayer::ReduceLayer()
     : _input(nullptr), _axes(nullptr), _output(nullptr), _reduce_kernel(new nnfw::cker::Reduce()),
-      _kernel()
+      _kernel(), _reduceType(ReduceType::kInvalid)
 {
   // DO NOTHING
 }
@@ -171,8 +172,9 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor
   _input = input;
   _axes = axes;
   _output = output;
+  _reduceType = reduceType;
 
-  switch (reduceType)
+  switch (_reduceType)
   {
     case ReduceType::kSum:
       if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
@@ -199,13 +201,23 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor
       _kernel = generateKernelGeneric(_input, keep_dims, *_reduce_kernel, ReduceType::kAll);
       break;
     default:
-      throw std::runtime_error{"ReduceSum: Unsupported reduce type"};
+      throw std::runtime_error{"Reduce: Unsupported reduce type"};
   }
 }
 
 void ReduceLayer::run()
 {
   const auto axes = getReducerAxes(_axes);
+#ifdef USE_NEON
+  int32_t rank = _input->num_dimensions();
+  if (_input->data_type() == ir::DataType::FLOAT32 && _reduceType == ReduceType::kSum &&
+      axes.size() == 1 && (axes[0] == -1 || axes[0] == rank - 1))
+  {
+    OptimizedReduceSum(reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_input),
+                       reinterpret_cast<float *>(_output->buffer()));
+    return;
+  }
+#endif // NEON
   _kernel(_input, _output, axes);
 }
 
diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.h b/runtime/onert/backend/cpu/ops/ReduceLayer.h
index 332d399bd..8265dd41f 100644
--- a/runtime/onert/backend/cpu/ops/ReduceLayer.h
+++ b/runtime/onert/backend/cpu/ops/ReduceLayer.h
@@ -17,6 +17,8 @@
 #ifndef __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__
 #define __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__
 
+#include "cker/neon/neon_check.h"
+
 #include <backend/IPortableTensor.h>
 
 #include <exec/IFunction.h>
@@ -47,6 +49,7 @@ enum class ReduceType
   kMin,
   kAny,
   kAll,
+  kInvalid // For debug and initialize
 };
 
 class ReduceLayer : public ::onert::exec::IFunction
@@ -70,6 +73,8 @@ private:
   std::function<void(const IPortableTensor *input, IPortableTensor *output,
                      const std::vector<int> &axes)>
       _kernel;
+
+  ReduceType _reduceType;
 };
 
 } // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
index 180094bb8..1fe56cb99 100644
--- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
@@ -28,16 +28,39 @@ namespace ops
 {
 
 ResizeBilinearLayer::ResizeBilinearLayer()
-    : _input(nullptr), _output(nullptr), _output_height(0), _output_width(0), _align_corners(false),
-      _half_pixel_centers(false)
+    : _input(nullptr), _output(nullptr), _size(nullptr), _output_height(0), _output_width(0),
+      _align_corners(false), _half_pixel_centers(false)
 {
   // DO NOTHING
 }
 
 void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output,
+                                    const IPortableTensor *size, bool align_corners,
+                                    bool half_pixel_centers)
+{
+  assert(!size->is_constant());
+  _input = input;
+  _output = output;
+  _size = size;
+  _align_corners = align_corners;
+  _half_pixel_centers = half_pixel_centers;
+}
+
+void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output,
                                     int32_t output_height, int32_t output_width, bool align_corners,
                                     bool half_pixel_centers)
 {
+  assert(_size == nullptr);
+  if (output_height < 0)
+  {
+    throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " +
+                             std::to_string(output_height)};
+  }
+  if (output_width < 0)
+  {
+    throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " +
+                             std::to_string(output_width)};
+  }
   _input = input;
   _output = output;
   _output_height = output_height;
@@ -49,10 +72,19 @@ void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTenso
 void ResizeBilinearLayer::run()
 {
   nnfw::cker::ResizeBilinearParams params;
+  if (_size == nullptr)
+  {
+    params.output_height = _output_height;
+    params.output_width = _output_width;
+  }
+  else
+  {
+    const auto size_buf = reinterpret_cast<const int32_t *>(_size->buffer());
+    params.output_height = size_buf[0];
+    params.output_width = size_buf[1];
+  }
   params.align_corners = _align_corners;
   params.half_pixel_centers = _half_pixel_centers;
-  params.output_height = _output_height;
-  params.output_width = _output_width;
 
   switch (_input->data_type())
   {
diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h
index fc49b348e..d7ae1c620 100644
--- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h
+++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h
@@ -36,7 +36,10 @@ public:
   ResizeBilinearLayer();
 
 public:
-  void configure(const IPortableTensor *input1, IPortableTensor *output, int32_t output_height,
+  void configure(const IPortableTensor *input1, IPortableTensor *output,
+                 const IPortableTensor *size, bool align_corners, bool half_pixel_centers);
+
+  void configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height,
                  int32_t output_width, bool align_corners, bool half_pixel_centers);
 
   void run() override;
@@ -44,6 +47,7 @@ public:
 private:
   const IPortableTensor *_input;
   IPortableTensor *_output;
+  const IPortableTensor *_size;
   int32_t _output_height;
   int32_t _output_width;
   bool _align_corners;
diff --git a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
index 095e67abc..b42be3042 100644
--- a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
@@ -62,7 +62,11 @@ void SoftMaxLayer::softmaxFloat32()
   }
   else
   {
-    throw std::runtime_error{"only 1D, 2D and 4D tensors supported"};
+    nnfw::cker::SoftmaxParams op_params;
+    op_params.beta = _beta;
+    nnfw::cker::reference::Softmax(
+        op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+        getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
   }
 }
 
diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.cc b/runtime/onert/backend/cpu/ops/SplitLayer.cc
index 1f40654c1..922cde2e3 100644
--- a/runtime/onert/backend/cpu/ops/SplitLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SplitLayer.cc
@@ -29,7 +29,7 @@ namespace cpu
 namespace ops
 {
 
-SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs()
+SplitLayer::SplitLayer() : _input(nullptr), _axis(nullptr), _num_splits(0), _outputs()
 {
   // DO NOTHING
 }
@@ -37,7 +37,16 @@ SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs()
 template <typename T> void SplitLayer::split(void)
 {
   nnfw::cker::SplitParams op_params;
-  op_params.axis = _axis;
+  if (_axis->total_size() != sizeof(int32_t))
+  {
+    throw std::runtime_error("ArgMinMax: wrong shape of axis");
+  }
+  auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer());
+  if (axis < 0)
+  {
+    axis += _input->num_dimensions();
+  }
+  op_params.axis = axis;
   op_params.num_split = _num_splits;
 
   std::vector<T *> outputPtrs;
@@ -53,8 +62,8 @@ template <typename T> void SplitLayer::split(void)
                        getTensorShape(_outputs[0]), outputPtrs.data());
 }
 
-void SplitLayer::configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis,
-                           std::vector<IPortableTensor *> &outputs)
+void SplitLayer::configure(const IPortableTensor *input, const IPortableTensor *axis,
+                           uint16_t num_splits, std::vector<IPortableTensor *> &outputs)
 {
   assert(input != nullptr);
 
diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.h b/runtime/onert/backend/cpu/ops/SplitLayer.h
index 0719a0063..090f87166 100644
--- a/runtime/onert/backend/cpu/ops/SplitLayer.h
+++ b/runtime/onert/backend/cpu/ops/SplitLayer.h
@@ -38,15 +38,15 @@ public:
 public:
   template <typename T> void split(void);
 
-  void configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis,
+  void configure(const IPortableTensor *input, const IPortableTensor *axis, uint16_t num_splits,
                  std::vector<IPortableTensor *> &outputs);
 
   void run() override;
 
 private:
   const IPortableTensor *_input;
+  const IPortableTensor *_axis;
   uint16_t _num_splits;
-  int16_t _axis;
   std::vector<IPortableTensor *> _outputs;
 };
 
diff --git a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
index dcbb87734..f77f4d691 100644
--- a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
@@ -37,17 +37,17 @@ StridedSliceLayer::StridedSliceLayer()
 
 template <typename T> void StridedSliceLayer::stridedSliceImpl()
 {
+  const auto input_shape = getTensorShape(_input);
+  const auto output_shape = getTensorShape(_output);
   auto op_params = nnfw::cker::buildStridedSliceParams(
       reinterpret_cast<uint32_t *>(_begin->buffer()), reinterpret_cast<uint32_t *>(_end->buffer()),
       reinterpret_cast<uint32_t *>(_strides->buffer()), _begin_mask, _end_mask, _shrink_axis_mask,
-      getTensorShape(_input).DimensionsCount());
+      input_shape.DimensionsCount());
 
-  nnfw::cker::checkOutputSize(op_params, getTensorShape(_input), getTensorShape(_output),
-                              getTensorShape(_input).DimensionsCount());
+  nnfw::cker::checkOutputSize(op_params, input_shape, output_shape, input_shape.DimensionsCount());
 
-  nnfw::cker::StridedSlice(op_params, getTensorShape(_input),
-                           reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
-                           reinterpret_cast<T *>(_output->buffer()));
+  nnfw::cker::StridedSlice(op_params, input_shape, reinterpret_cast<const T *>(_input->buffer()),
+                           output_shape, reinterpret_cast<T *>(_output->buffer()));
 }
 
 void StridedSliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.cc b/runtime/onert/backend/cpu/ops/TransposeLayer.cc
index 7b232562a..3362c3396 100644
--- a/runtime/onert/backend/cpu/ops/TransposeLayer.cc
+++ b/runtime/onert/backend/cpu/ops/TransposeLayer.cc
@@ -19,6 +19,7 @@
 #include "OperationUtils.h"
 
 #include <cker/operation/Transpose.h>
+#include <numeric>
 
 namespace onert
 {
@@ -29,7 +30,7 @@ namespace cpu
 namespace ops
 {
 
-TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm()
+TransposeLayer::TransposeLayer() : _input(nullptr), _perm(nullptr), _output(nullptr)
 {
   // DO NOTHING
 }
@@ -37,10 +38,23 @@ TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm()
 template <typename T> void TransposeLayer::transpose()
 {
   nnfw::cker::TransposeParams param;
-  param.perm_count = _perm.size();
-  for (size_t i = 0; i < _perm.size(); i++)
+  assert(_perm->num_dimensions() == 1);
+
+  param.perm_count = _input->num_dimensions();
+  if (_perm->dimension(0) == 0) // This means _perm is (n-1...0)
+  {
+    const auto begin = param.perm;
+    const auto end = param.perm + _input->num_dimensions();
+    std::iota(begin, end, 0);
+    std::reverse(begin, end);
+  }
+  else
   {
-    param.perm[i] = _perm[i];
+    assert(param.perm_count == static_cast<int>(_perm->dimension(0)));
+    for (auto i = 0; i < param.perm_count; i++)
+    {
+      param.perm[i] = *(reinterpret_cast<const int32_t *>(_perm->buffer()) + i);
+    }
   }
 
   nnfw::cker::Transpose(param, getTensorShape(_input),
@@ -63,8 +77,8 @@ void TransposeLayer::transposeQuant8()
   transpose<uint8_t>();
 }
 
-void TransposeLayer::configure(const IPortableTensor *input, IPortableTensor *output,
-                               const std::vector<int> &perm)
+void TransposeLayer::configure(const IPortableTensor *input, const IPortableTensor *perm,
+                               IPortableTensor *output)
 {
   _input = input;
   _perm = perm;
diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.h b/runtime/onert/backend/cpu/ops/TransposeLayer.h
index f9cb12770..c8e9f8ae7 100644
--- a/runtime/onert/backend/cpu/ops/TransposeLayer.h
+++ b/runtime/onert/backend/cpu/ops/TransposeLayer.h
@@ -40,15 +40,15 @@ public:
 
   void transposeQuant8();
 
-  void configure(const IPortableTensor *input, IPortableTensor *output,
-                 const std::vector<int> &perm);
+  void configure(const IPortableTensor *input, const IPortableTensor *perm,
+                 IPortableTensor *output);
 
   void run() override;
 
 private:
   const IPortableTensor *_input;
+  const IPortableTensor *_perm;
   IPortableTensor *_output;
-  std::vector<int> _perm;
 };
 
 } // namespace ops
diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt
index d58b47ced..344b2a972 100644
--- a/runtime/onert/core/CMakeLists.txt
+++ b/runtime/onert/core/CMakeLists.txt
@@ -13,6 +13,11 @@ target_link_libraries(onert_core PRIVATE nnfw_coverage)
 target_link_libraries(onert_core PRIVATE dl ${LIB_PTHREAD})
 target_link_libraries(onert_core PRIVATE jsoncpp)
 target_link_libraries(onert_core INTERFACE ruy_instrumentation)
+# NOTE Below line is added to remove warning for android build
+#      It will be removed after android build uses gold linker
+if (ANDROID)
+  target_link_libraries(onert_core INTERFACE log)
+endif (ANDROID)
 
 if(ENVVAR_ONERT_CONFIG)
   target_compile_definitions(onert_core PRIVATE ENVVAR_FOR_DEFAULT_CONFIG)
diff --git a/runtime/onert/core/include/backend/CustomKernelBuilder.h b/runtime/onert/core/include/backend/CustomKernelBuilder.h
index 101272135..cae2fc1a3 100644
--- a/runtime/onert/core/include/backend/CustomKernelBuilder.h
+++ b/runtime/onert/core/include/backend/CustomKernelBuilder.h
@@ -49,10 +49,10 @@ struct TypeInfo
 
 struct CustomKernelConfigParams
 {
-  std::vector<std::shared_ptr<backend::IPortableTensor>> input_tensors;
+  std::vector<backend::IPortableTensor *> input_tensors;
   std::vector<TypeInfo> input_types;
 
-  std::vector<std::shared_ptr<backend::IPortableTensor>> output_tensors;
+  std::vector<backend::IPortableTensor *> output_tensors;
   std::vector<TypeInfo> output_types;
 
   char *userdata;
diff --git a/runtime/onert/core/include/backend/IDynamicTensorManager.h b/runtime/onert/core/include/backend/IDynamicTensorManager.h
index 343c52c4a..67cfda24e 100644
--- a/runtime/onert/core/include/backend/IDynamicTensorManager.h
+++ b/runtime/onert/core/include/backend/IDynamicTensorManager.h
@@ -39,24 +39,12 @@ struct IDynamicTensorManager : public ITensorManager
 
 public:
   /**
-   * @brief Set new shape and allocate memory for dynamic tensor.
-   *        If a tensor is dynamic tensor and previously allocated memory exists,
-   *        it will be deallocated.
-   *        If a tensor is static tensor (with previously allocated memory by StaticTensorManager),
-   *        tensor->buffer() will be overwrite to the dynamically allocated memory
-   * @param ind             operand index of a tensor
-   * @param new_shape       tensor's new shape. While allocating memory for this new_shape,
-   *                        tensor's shape is set to new_shape
-   */
-  virtual void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) = 0;
-
-  /**
    * @brief Plan when to delete a tensor. Note this planning is done at compilation time.
    * @param op_ind        operation index
-   * @param operand_ind   operand index of input operand of first param op. Operand can be static
+   * @param tensor        candidate ITensor to dealloc. Tensor can be static
    *                      or dynamic since tensor type may not be clearly known at compilation time.
    */
-  virtual void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) = 0;
+  virtual void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) = 0;
 
   /**
    * @brief Deallocate input tensors of op if an input tensor is a dynamic tensor and it won't
@@ -64,12 +52,6 @@ public:
    * @note  This will work after calling planDealloc
    */
   virtual void deallocInput(ir::OperationIndex op_ind) = 0;
-
-  /**
-   * @brief Deallocate an output tensor if the tensor is a dynamic tensor
-   * @note  This will work after calling planDealloc
-   */
-  virtual void deallocSubgraphOutput(ir::OperandIndex ind) = 0;
 };
 
 } // namespace backend
diff --git a/runtime/onert/core/include/backend/IPortableTensor.h b/runtime/onert/core/include/backend/IPortableTensor.h
index a05b39a33..1b1f05fe1 100644
--- a/runtime/onert/core/include/backend/IPortableTensor.h
+++ b/runtime/onert/core/include/backend/IPortableTensor.h
@@ -18,6 +18,8 @@
 #define __ONERT_BACKEND_I_PORTABLE_TENSOR_H__
 
 #include "backend/ITensor.h"
+#include "ir/OperandInfo.h"
+#include "ir/Sparsity.h"
 
 namespace onert
 {
@@ -36,14 +38,18 @@ namespace backend
 class IPortableTensor : public ITensor
 {
 public:
-  virtual ~IPortableTensor() = default;
-  virtual bool is_sparse() const { return false; }
-  virtual const uint16_t *w1_segments() const { return nullptr; }
-  virtual const uint16_t *w1_indices() const { return nullptr; }
+  IPortableTensor(const ir::OperandInfo &info) : _info(info) {}
+
+  virtual ~IPortableTensor();
+  virtual const ir::Sparsity *sparsity() const { return nullptr; }
+  const ir::OperandInfo &get_info() const { return _info; }
 
 public:
   bool has_padding() const final { return false; }
   void access(const std::function<void(ITensor &tensor)> &fn) final { fn(*this); }
+
+protected:
+  ir::OperandInfo _info;
 };
 
 } // namespace backend
diff --git a/runtime/onert/core/include/backend/ITensor.h b/runtime/onert/core/include/backend/ITensor.h
index 12b1c5433..b18dd30a2 100644
--- a/runtime/onert/core/include/backend/ITensor.h
+++ b/runtime/onert/core/include/backend/ITensor.h
@@ -53,13 +53,19 @@ public:
   virtual void access(const std::function<void(ITensor &tensor)> &fn) = 0;
 
   /**
-   * @brief Return the dynamic tensor manager
+   * @brief Set the shape to @c shape and possibly re-allocate the buffer
    *
-   * If dynamic tensors are not supported, it returns @c nullptr .
+   * If a tensor is dynamic tensor and previously allocated memory exists,
+   * it will be deallocated.
+   * If a tensor is static tensor (with previously allocated memory by StaticTensorManager),
+   * @c buffer() will be overwriten
    *
-   * @return IDynamicTensorManager* DynamicTensorManager
+   * @param shape tensor's new shape. While allocating memory for this new_shape,
+   *              tensor's shape is set to new_shape
+   * @return true If applying shape is successful
+   * @return false If not applying shape is not supported (it throws for other errors)
    */
-  virtual IDynamicTensorManager *dynamic_tensor_manager() { return nullptr; }
+  virtual bool applyShape(const ir::Shape &) { return false; }
 
   /**
    * @brief Return true if the tensor is constant
diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h
index f93ab81ae..97721cf19 100644
--- a/runtime/onert/core/include/backend/ITensorBuilder.h
+++ b/runtime/onert/core/include/backend/ITensorBuilder.h
@@ -89,14 +89,6 @@ public: // methods for static tensor allocation
    */
   virtual void postFunctionPrepare() = 0;
 
-  /**
-   * @brief Release static @c ITensorManger object which was built
-   *        Before calling this, @c allocate must have been called
-   *
-   * @return std::unique_ptr<ITensorManager> Tensor Manager object
-   */
-  virtual std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) = 0;
-
 public: // methods for dynamic tensor allocation
   /**
    * @brief Get dynamicTensorManager. If a backend does not support dynamic tensor, exception
@@ -108,14 +100,6 @@ public: // methods for dynamic tensor allocation
    *         to the end of execution
    */
   virtual IDynamicTensorManager *dynamicTensorManager(void) { return nullptr; }
-
-  /**
-   * @brief Release dynamic @c ITensorManger object which was built
-   *        Before calling this, @c allocate must have been called
-   *
-   * @return std::unique_ptr<ITensorManager> Tensor Manager object
-   */
-  virtual std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) { return nullptr; }
 };
 
 } // namespace backend
diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h
index 88fcb0fcd..b256a1fb8 100644
--- a/runtime/onert/core/include/backend/ITensorRegistry.h
+++ b/runtime/onert/core/include/backend/ITensorRegistry.h
@@ -43,7 +43,7 @@ struct ITensorRegistry
    *
    * @note  Return tensor cannot be used longer than dynamic tensor manager
    */
-  virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0;
+  virtual ITensor *getITensor(const ir::OperandIndex &) = 0;
   /**
    * @brief Returns pointer of ITensor among native tensors
    *
@@ -51,17 +51,14 @@ struct ITensorRegistry
    *
    * @note  Returned tensor cannot be used longer than dynamic tensor manager
    */
-  virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0;
+  virtual ITensor *getNativeITensor(const ir::OperandIndex &) = 0;
   /**
    * @brief Set the Migrant Tensor which are from other backends
    *
    * @return true if supported
    * @return false if not supported
    */
-  virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
-  {
-    return false;
-  }
+  virtual bool setMigrantTensor(const ir::OperandIndex &, IPortableTensor *) { return false; }
 };
 
 } // namespace backend
@@ -85,41 +82,37 @@ namespace backend
 template <typename T_Tensor> class PortableTensorRegistryTemplate : public ITensorRegistry
 {
 public:
-  std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
+  ITensor *getITensor(const ir::OperandIndex &ind) override
   {
     static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor.");
-    auto external_tensor = _migrant.find(ind);
-    if (external_tensor != _migrant.end())
-      return external_tensor->second;
+    auto _migrant_tensor = _migrant.find(ind);
+    if (_migrant_tensor != _migrant.end())
+      return _migrant_tensor->second;
     return getNativeTensor(ind);
   }
 
-  std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
-  {
-    return getNativeTensor(ind);
-  }
+  ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getNativeTensor(ind); }
 
-  std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind)
+  IPortableTensor *getPortableTensor(const ir::OperandIndex &ind)
   {
-    auto external_tensor = _migrant.find(ind);
-    if (external_tensor != _migrant.end())
+    auto _migrant_tensor = _migrant.find(ind);
+    if (_migrant_tensor != _migrant.end())
     {
-      if (external_tensor->second)
-        return external_tensor->second;
+      if (_migrant_tensor->second)
+        return _migrant_tensor->second;
     }
     return getNativeTensor(ind);
   }
 
-  std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind)
+  T_Tensor *getNativeTensor(const ir::OperandIndex &ind)
   {
     auto tensor = _native.find(ind);
     if (tensor != _native.end())
-      return tensor->second;
+      return tensor->second.get();
     return nullptr;
   }
 
-  bool setMigrantTensor(const ir::OperandIndex &ind,
-                        const std::shared_ptr<IPortableTensor> &tensor) override
+  bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override
   {
     assert(tensor != nullptr);
     auto itr = _native.find(ind);
@@ -129,25 +122,22 @@ public:
     return true;
   }
 
-  void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
+  void setNativeTensor(const ir::OperandIndex &ind, std::unique_ptr<T_Tensor> &&tensor)
   {
     assert(tensor != nullptr);
     auto itr = _migrant.find(ind);
     if (itr != _migrant.end())
       throw std::runtime_error{"Tried to set a native tensor but a migrant tensor already exists."};
-    _native[ind] = tensor;
+    _native[ind] = std::move(tensor);
   }
 
-  const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; }
+  const ir::OperandIndexMap<std::unique_ptr<T_Tensor>> &native_tensors() { return _native; }
 
-  const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors()
-  {
-    return _migrant;
-  }
+  const ir::OperandIndexMap<IPortableTensor *> &migrant_tensors() { return _migrant; }
 
 private:
-  ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant;
-  ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native;
+  ir::OperandIndexMap<IPortableTensor *> _migrant;
+  ir::OperandIndexMap<std::unique_ptr<T_Tensor>> _native;
 };
 
 } // namespace backend
diff --git a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h
index e3c8c8666..c4e06aa82 100644
--- a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h
+++ b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h
@@ -44,14 +44,16 @@ public:
 
   virtual ~DynamicTensorManager() = default;
 
-  void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override;
-
   void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
                    ir::Layout backend_layout);
 
-  void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override;
+  void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override;
   void deallocInput(ir::OperationIndex op_ind) override;
-  void deallocSubgraphOutput(ir::OperandIndex ind) override;
+
+  std::shared_ptr<DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; }
+
+private:
+  const ITensor *getRawITensor(ir::OperandIndex ind);
 
 private:
   /**
@@ -63,7 +65,8 @@ private:
 
   // contains list of dynamic tensor index, which can be deallocated after running operation
   // note: this map could contain static tensor index too. Careful use is required.
-  std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map;
+  std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>>
+      _dealloc_tensor_map;
 };
 
 } // namespace cpu_common
diff --git a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h
index 4be7a1a11..28ec6b803 100644
--- a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h
+++ b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h
@@ -20,12 +20,14 @@
 #include "Allocator.h"
 #include "backend/IMemoryManager.h"
 #include "IMemoryPlanner.h"
-#include "ir/OperandIndexMap.h"
 
 namespace onert
 {
 namespace backend
 {
+
+class ITensor;
+
 namespace cpu_common
 {
 
@@ -59,12 +61,12 @@ public:
   DynamicMemoryManager() = default;
   virtual ~DynamicMemoryManager() = default;
 
-  std::shared_ptr<Allocator> allocate(const ir::OperandIndex &ind, uint32_t capacity);
-  void deallocate(const ir::OperandIndex &ind);
+  std::shared_ptr<Allocator> allocate(const ITensor *tensor, uint32_t capacity);
+  void deallocate(const ITensor *tensor);
   void deallocate(void);
 
 private:
-  ir::OperandIndexMap<std::shared_ptr<Allocator>> _mem_alloc_map;
+  std::unordered_map<const ITensor *, std::shared_ptr<Allocator>> _mem_alloc_map;
 };
 
 } // namespace cpu_common
diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
index 3f09b7a4a..fa50b551e 100644
--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
@@ -20,7 +20,6 @@
 #include "MemoryManager.h"
 
 #include "backend/IStaticTensorManager.h"
-#include "backend/IDynamicTensorManager.h"
 #include "ir/OperandIndexMap.h"
 #include "ir/OperandInfo.h"
 #include "TensorRegistry.h"
@@ -32,11 +31,13 @@ namespace backend
 namespace cpu_common
 {
 
+class DynamicTensorManager;
+
 class StaticTensorManager : public backend::IStaticTensorManager
 {
 public:
   StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg,
-                      IDynamicTensorManager *dynamic_tensor_manager);
+                      DynamicMemoryManager *dynamic_mem_mgr);
   virtual ~StaticTensorManager() = default;
 
   void allocateConsts(void);
@@ -57,7 +58,7 @@ private:
   std::unique_ptr<MemoryManager> _nonconst_mgr;
   const std::shared_ptr<TensorRegistry> _tensors;
   ir::OperandIndexMap<bool> _as_constants;
-  IDynamicTensorManager *_dynamic_tensor_manager;
+  DynamicMemoryManager *_dynamic_mem_mgr;
 };
 
 } // namespace cpu_common
diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/cpu_common/Tensor.h
index 974501ecb..5fa20e15d 100644
--- a/runtime/onert/core/include/backend/cpu_common/Tensor.h
+++ b/runtime/onert/core/include/backend/cpu_common/Tensor.h
@@ -29,16 +29,19 @@ namespace backend
 namespace cpu_common
 {
 
+class DynamicMemoryManager;
+
 class Tensor : public IPortableTensor
 {
 public:
   Tensor() = delete;
+  virtual ~Tensor();
 
 public:
   Tensor(const ir::OperandInfo &info, const ir::Layout layout,
-         IDynamicTensorManager *dynamic_tensor_manager)
-      : _info(info), _layout(layout), _buffer(nullptr), _num_references(0),
-        _dynamic_tensor_manager(dynamic_tensor_manager), _allocator(nullptr)
+         DynamicMemoryManager *dynamic_mem_mgr)
+      : IPortableTensor(info), _layout(layout), _buffer(nullptr), _num_references(0),
+        _dynamic_mem_mgr(dynamic_mem_mgr), _allocator(nullptr)
   {
     // DO NOTHING
   }
@@ -94,7 +97,7 @@ public:
    *       W : dimension(2)
    *       C : dimension(3)
    */
-  size_t dimension(size_t index) const override { return _info.shape().dim(index); }
+  size_t dimension(size_t index) const final override { return _info.shape().dim(index); }
   size_t num_dimensions() const override { return _info.shape().rank(); }
   size_t total_size() const override { return _info.total_size(); }
   size_t calcOffset(const ir::Coordinates &coords) const override;
@@ -105,10 +108,8 @@ public:
   bool is_constant() const override { return _info.isConstant(); }
   bool is_dynamic() const override { return _info.isDynamic(); }
   void set_dynamic() override { _info.setDynamic(); }
-  IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; }
-  bool is_sparse() const override { return _info.typeInfo().sparse(); }
-  virtual const uint16_t *w1_segments() const override { return _info.typeInfo().w1_segments(); }
-  virtual const uint16_t *w1_indices() const override { return _info.typeInfo().w1_indices(); }
+  bool applyShape(const ir::Shape &new_shape) override;
+  const ir::Sparsity *sparsity() const override { return _info.typeInfo().sparsity(); }
 
   virtual void increase_ref()
   {
@@ -118,6 +119,7 @@ public:
 
     ++_num_references;
   }
+
   virtual void decrease_ref()
   {
     assert(_buffer != nullptr || _allocator != nullptr);
@@ -136,14 +138,34 @@ public:
     }
   }
 
+  /**
+   * @brief Reset reference count to zero and release data
+   */
+  virtual void reset_ref()
+  {
+    assert(_buffer != nullptr || _allocator != nullptr);
+    assert(_num_references > 0);
+    _num_references = 0;
+
+    // Only constant tensor has allocator pointer
+    if (_buffer != nullptr)
+      _buffer = nullptr;
+    else
+    {
+      _allocator->release();
+      _allocator = nullptr;
+    }
+  }
+
+  virtual int32_t num_references() { return _num_references; }
+
   void setShape(const ir::Shape &new_shape) override;
 
 protected:
-  ir::OperandInfo _info;
   ir::Layout _layout;
   uint8_t *_buffer;
   int32_t _num_references;
-  IDynamicTensorManager *_dynamic_tensor_manager;
+  DynamicMemoryManager *_dynamic_mem_mgr;
 
 private:
   /**
diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h
index b97cb5b7b..5af11074e 100644
--- a/runtime/onert/core/include/compiler/StaticShapeInference.h
+++ b/runtime/onert/core/include/compiler/StaticShapeInference.h
@@ -70,6 +70,8 @@ private:
   // TODO Define visitors for operations. List them in alphabetic order.
   void visit(const ir::operation::ArgMax &op) override;
   void visit(const ir::operation::BatchMatMul &op) override;
+  void visit(const ir::operation::BCQFullyConnected &op) override;
+  void visit(const ir::operation::BCQGather &op) override;
   void visit(const ir::operation::BinaryArithmetic &op) override;
   void visit(const ir::operation::BroadcastTo &op) override;
   void visit(const ir::operation::Comparison &op) override;
@@ -85,6 +87,7 @@ private:
   void visit(const ir::operation::Gather &op) override;
   void visit(const ir::operation::If &op) override;
   void visit(const ir::operation::L2Normalization &op) override;
+  void visit(const ir::operation::LSTM &op) override;
   void visit(const ir::operation::MatrixBandPart &op) override;
   void visit(const ir::operation::OneHot &op) override;
   void visit(const ir::operation::Pack &op) override;
diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h
index 6f6659659..4a86708d0 100644
--- a/runtime/onert/core/include/exec/DynamicShapeInference.h
+++ b/runtime/onert/core/include/exec/DynamicShapeInference.h
@@ -51,6 +51,8 @@ public:
   // Remove TODO when any op starting from the alphabet is added
   void visit(const ir::operation::ArgMax &op) override;
   void visit(const ir::operation::BatchMatMul &op) override;
+  void visit(const ir::operation::BCQFullyConnected &op) override;
+  void visit(const ir::operation::BCQGather &op) override;
   void visit(const ir::operation::BinaryArithmetic &op) override;
   void visit(const ir::operation::BroadcastTo &op) override;
   void visit(const ir::operation::Comparison &op) override;
@@ -65,6 +67,7 @@ public:
   void visit(const ir::operation::FusedBatchNorm &op) override;
   void visit(const ir::operation::Gather &op) override;
   void visit(const ir::operation::L2Normalization &op) override;
+  void visit(const ir::operation::LSTM &op) override;
   void visit(const ir::operation::MatrixBandPart &op) override;
   void visit(const ir::operation::OneHot &op) override;
   void visit(const ir::operation::Pack &op) override;
diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h
index 79a58ed00..49f00dba1 100644
--- a/runtime/onert/core/include/exec/FunctionSequence.h
+++ b/runtime/onert/core/include/exec/FunctionSequence.h
@@ -79,7 +79,6 @@ public: // methods related to dynamic tensor
     const ir::OpSequence *op_seq = nullptr;
     const ir::Operations *operations = nullptr;
     std::shared_ptr<exec::DynamicShapeInferer> dynamic_shape_inferer = nullptr;
-    std::shared_ptr<backend::ITensorRegistry> tensor_registry = nullptr;
     backend::IDynamicTensorManager *dynamic_tensor_manager = nullptr;
   };
 
@@ -104,14 +103,25 @@ public: // methods related to dynamic tensor
    */
   void enableDynamicShapeInferer(bool enable)
   {
-    _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer && enable;
+    _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer || enable;
   }
 
+  /**
+   * @brief Call this function to initialize vars before running
+   * @note When we run a model with static tensor input and then run with dynamic tensor input,
+   *       _enable_dynamic_shape_inferer is set to @c false at first run.
+   *       Once _enable_dynamic_shape_inferer is set to @c true it cannot be changed to @c false
+   *       only with calling enableDynamicShapeInferer(). So initializing it to @c false is
+   *       necessary.
+   * @todo This is a quick fix. Adding this will increase time for run(). Find way to optimize.
+   */
+  void initRunning() { _enable_dynamic_shape_inferer = false; }
+
 protected:
   std::vector<std::unique_ptr<IFunction>> _functions;
 
 protected:
-  bool _enable_dynamic_shape_inferer = true;
+  bool _enable_dynamic_shape_inferer = false;
 
   std::shared_ptr<DynamicTensorCtx> _dynamic_tensor_ctx = nullptr;
 };
diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h
index 6c8bab67c..1d2831dd0 100644
--- a/runtime/onert/core/include/exec/IExecutor.h
+++ b/runtime/onert/core/include/exec/IExecutor.h
@@ -69,21 +69,6 @@ struct IExecutor
 
 using ExecutorMap = std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>>;
 
-// TODO Move this structure to suitable place
-/**
- * @brief Dynamic allocation info for input tensors
- *        When user sets shape of input having unknown dims after compilation, memory for the input
- * should be allocated before executing kernels. This struct contains information to allocate
- * memory.
- */
-struct DynAllocInfo
-{
-  /// @brief index of input tensor whose memory needs to be allocated at execution time
-  ir::OperandIndex ind;
-};
-
-using DynAllocInfoMap = std::unordered_map<std::shared_ptr<backend::ITensor>, DynAllocInfo>;
-
 } // namespace exec
 } // namespace onert
 
diff --git a/runtime/onert/core/include/ir/Operand.h b/runtime/onert/core/include/ir/Operand.h
index 1b3a43b02..f149a744b 100644
--- a/runtime/onert/core/include/ir/Operand.h
+++ b/runtime/onert/core/include/ir/Operand.h
@@ -40,6 +40,7 @@ public:
   {
     // DO NOTHING
   }
+  explicit Operand(const Operand &) = default;
 
 public:
   const Shape &shape(void) const { return _info.shape(); }
diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h
index aa01eccaa..2f78cc832 100644
--- a/runtime/onert/core/include/ir/OperandIndexSequence.h
+++ b/runtime/onert/core/include/ir/OperandIndexSequence.h
@@ -82,6 +82,8 @@ public:
 public:
   std::vector<OperandIndex>::const_iterator begin(void) const { return _vec.begin(); }
   std::vector<OperandIndex>::const_iterator end(void) const { return _vec.end(); }
+  std::vector<OperandIndex>::iterator begin(void) { return _vec.begin(); }
+  std::vector<OperandIndex>::iterator end(void) { return _vec.end(); }
 
 private:
   std::vector<OperandIndex> _vec;
diff --git a/runtime/onert/core/include/ir/OperandInfo.h b/runtime/onert/core/include/ir/OperandInfo.h
index b8e123027..67aeb0e65 100644
--- a/runtime/onert/core/include/ir/OperandInfo.h
+++ b/runtime/onert/core/include/ir/OperandInfo.h
@@ -117,6 +117,7 @@ public:
 
   MemAllocType memAllocType() const { return _alloc_type; }
   void setAsConstant() { _const = true; }
+  void setAsNonConst() { _const = false; }
   bool isConstant() const
   {
     // Impossible case: constant and dynamic operand
diff --git a/runtime/onert/core/include/ir/Operation.h b/runtime/onert/core/include/ir/Operation.h
index 818bd913b..89f7e340d 100644
--- a/runtime/onert/core/include/ir/Operation.h
+++ b/runtime/onert/core/include/ir/Operation.h
@@ -34,9 +34,12 @@ struct OperationVisitor;
 class Operation
 {
 public:
+  // TODO Remove default parameter
   Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs,
-            const OperandIndexSequence &outputs);
-  explicit Operation(OperandConstraint input_constr);
+            const OperandIndexSequence &outputs,
+            OperandConstraint output_constr = OperandConstraint::createAny());
+  explicit Operation(OperandConstraint input_constr,
+                     OperandConstraint output_constr = OperandConstraint::createAny());
 
   Operation(const Operation &) = default;
   Operation(Operation &&) = default;
@@ -62,6 +65,7 @@ public:
 
 private:
   OperandConstraint _input_constr;
+  OperandConstraint _output_constr;
   OperandIndexSequence _inputs;
   OperandIndexSequence _outputs;
 };
diff --git a/runtime/onert/core/include/ir/Sparsity.h b/runtime/onert/core/include/ir/Sparsity.h
new file mode 100644
index 000000000..ad4d8259b
--- /dev/null
+++ b/runtime/onert/core/include/ir/Sparsity.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+#ifndef __ONERT_IR_SPARSITY_H__
+#define __ONERT_IR_SPARSITY_H__
+
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+namespace onert
+{
+namespace ir
+{
+
+/**
+ * @brief  Structure for Sparse Tensor
+ */
+struct Sparsity
+{
+public:
+  Sparsity() = default;
+  Sparsity(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices,
+           std::vector<int32_t> &&block_size)
+      : _w1_segments(w1_segments), _w1_indices(w1_indices), _block_size(block_size)
+  {
+  }
+
+  /**
+   * @brief Returns segments array. See compressed sparse row format.
+   */
+  const uint16_t *w1_segments() const { return _w1_segments.data(); }
+  /**
+   * @brief Returns indices array. See compressed sparse row format.
+   */
+  const uint16_t *w1_indices() const { return _w1_indices.data(); }
+  /**
+   * @brief Returns block size which is used for block sparsity
+   */
+  const std::vector<int32_t> &block_size() const { return _block_size; }
+
+private:
+  std::vector<uint16_t> _w1_segments;
+  std::vector<uint16_t> _w1_indices;
+  std::vector<int32_t> _block_size;
+};
+
+} // namespace ir
+} // namespace onert
+
+#endif // __ONERT_IR_SPARSITY_H__
diff --git a/runtime/onert/core/include/ir/TypeInfo.h b/runtime/onert/core/include/ir/TypeInfo.h
index 3f7eab4c0..a1ae4d2e4 100644
--- a/runtime/onert/core/include/ir/TypeInfo.h
+++ b/runtime/onert/core/include/ir/TypeInfo.h
@@ -18,9 +18,11 @@
 #define __ONERT_IR_TYPEINFO_H__
 
 #include <cstdint>
+#include <memory>
 #include <vector>
 
 #include "ir/DataType.h"
+#include "ir/Sparsity.h"
 
 namespace onert
 {
@@ -33,7 +35,7 @@ public:
   TypeInfo() = delete;
 
   explicit TypeInfo(DataType type, float scale = 0, int32_t offset = 0)
-      : _type(type), _scale(scale), _offset(offset), _sparse(false)
+      : _type(type), _scale(scale), _offset(offset), _sparsity(nullptr)
   {
   }
 
@@ -41,18 +43,11 @@ public:
   DataType type() const { return _type; }
   float scale() const { return _scale; }
   int32_t offset() const { return _offset; }
-  bool sparse() const { return _sparse; }
-  const uint16_t *w1_segments() const { return _w1_segments.data(); }
-  const uint16_t *w1_indices() const { return _w1_indices.data(); }
+  const ir::Sparsity *sparsity() const { return _sparsity.get(); }
+  void sparsity(std::shared_ptr<ir::Sparsity> sparsity) { _sparsity = sparsity; }
 
 public:
   void type(const DataType type) { _type = type; }
-  void sparse2DMetadata(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices)
-  {
-    _sparse = true;
-    _w1_segments = w1_segments;
-    _w1_indices = w1_indices;
-  }
 
 private:
   DataType _type;
@@ -60,9 +55,7 @@ private:
   float _scale;
   int32_t _offset;
   // for sparsity
-  bool _sparse;
-  std::vector<uint16_t> _w1_segments;
-  std::vector<uint16_t> _w1_indices;
+  std::shared_ptr<ir::Sparsity> _sparsity;
 };
 
 bool operator==(const TypeInfo &lhs, const TypeInfo &rhs);
diff --git a/runtime/onert/core/include/ir/operation/ArgMax.h b/runtime/onert/core/include/ir/operation/ArgMax.h
index 8400e1f1e..ea7eabb83 100644
--- a/runtime/onert/core/include/ir/operation/ArgMax.h
+++ b/runtime/onert/core/include/ir/operation/ArgMax.h
@@ -31,12 +31,12 @@ class ArgMax : public Operation
 public:
   enum Input
   {
-    INPUT
+    INPUT = 0,
+    AXIS = 1
   };
 
   struct Param
   {
-    int axis;
     DataType output_type;
   };
 
diff --git a/runtime/onert/core/include/ir/operation/LSTM.h b/runtime/onert/core/include/ir/operation/LSTM.h
index 1e6c00bf3..027bc6b42 100644
--- a/runtime/onert/core/include/ir/operation/LSTM.h
+++ b/runtime/onert/core/include/ir/operation/LSTM.h
@@ -26,6 +26,7 @@ namespace ir
 namespace operation
 {
 
+// This operation supports only unidirectional sequence lstm
 class LSTM : public Operation
 {
 public:
@@ -51,6 +52,10 @@ public:
     PROJECTION_BIAS = 17,
     OUTPUT_STATE_IN = 18,
     CELL_STATE_IN = 19,
+    INPUT_LAYER_NORMALIZATION_WEIGHTS = 20,
+    FORGET_LAYER_NORMALIZATION_WEIGHTS = 21,
+    CELL_LAYER_NORMALIZATION_WEIGHTS = 22,
+    OUTPUT_LAYER_NORMALIZATION_WEIGHTS = 23,
   };
 
   enum Output
@@ -66,6 +71,7 @@ public:
     Activation activation;
     float cell_threshold;
     float projection_threshold;
+    bool time_major;
   };
 
 public:
@@ -73,6 +79,7 @@ public:
 
 public:
   void accept(OperationVisitor &v) const override;
+  std::string name() const override;
   OpCode opcode() const final { return OpCode::LSTM; }
 
 public:
diff --git a/runtime/onert/core/include/ir/operation/ResizeBilinear.h b/runtime/onert/core/include/ir/operation/ResizeBilinear.h
index 29aa496d7..ab330c826 100644
--- a/runtime/onert/core/include/ir/operation/ResizeBilinear.h
+++ b/runtime/onert/core/include/ir/operation/ResizeBilinear.h
@@ -34,10 +34,12 @@ public:
   enum Input
   {
     INPUT = 0,
+    SIZE = 1,
   };
 
   struct Param
   {
+    // If the input SIZE exists in inputs, height_out and width_out are not set. Ignore these params
     int32_t height_out;
     int32_t width_out;
     bool align_corners;
diff --git a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h
index e4d810eeb..10827803e 100644
--- a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h
+++ b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h
@@ -34,10 +34,12 @@ public:
   enum Input
   {
     INPUT = 0,
+    SIZE = 1,
   };
 
   struct Param
   {
+    // If the input SIZE exists in inputs, Be height_out and width_out not set. Ignore these params
     int32_t height_out;
     int32_t width_out;
     bool align_corners;
diff --git a/runtime/onert/core/include/ir/operation/Split.h b/runtime/onert/core/include/ir/operation/Split.h
index 60e0fdf15..c415941a4 100644
--- a/runtime/onert/core/include/ir/operation/Split.h
+++ b/runtime/onert/core/include/ir/operation/Split.h
@@ -29,12 +29,12 @@ class Split : public Operation
 public:
   enum Input
   {
-    INPUT = 0
+    AXIS = 0,
+    INPUT = 1,
   };
 
   struct Param
   {
-    int axis;
     int num_splits;
   };
 
diff --git a/runtime/onert/core/include/ir/operation/Transpose.h b/runtime/onert/core/include/ir/operation/Transpose.h
index 9631f7aaa..665c9bbce 100644
--- a/runtime/onert/core/include/ir/operation/Transpose.h
+++ b/runtime/onert/core/include/ir/operation/Transpose.h
@@ -34,26 +34,15 @@ public:
   enum Input
   {
     INPUT = 0, // for an n-D tensor, specifying the tensor to be transposed.
-  };
-
-  struct Param
-  {
-    std::vector<int> perm;
+    PERMUTATION = 1,
   };
 
 public:
-  Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
-            const Param &param);
+  Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs);
 
 public:
   void accept(OperationVisitor &v) const override;
   OpCode opcode() const final { return OpCode::Transpose; }
-
-public:
-  const Param &param() const { return _param; }
-
-private:
-  Param _param;
 };
 
 } // namespace operation
diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst
index 5077fad69..30f211011 100644
--- a/runtime/onert/core/include/util/Config.lst
+++ b/runtime/onert/core/include/util/Config.lst
@@ -35,6 +35,7 @@ CONFIG(OP_SEQ_MAX_NODE         , int          , "0")
 CONFIG(TRACE_FILEPATH          , std::string  , "")
 CONFIG(FP16_ENABLE             , bool         , "0")
 CONFIG(RUY_THREADS             , int          , "-1")
+CONFIG(USE_MMAPED_DATA         , bool         , "0")
 
 // Auto-generate all operations
 
diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h
index 1ebed48f2..701b835d2 100644
--- a/runtime/onert/core/include/util/ShapeInference.h
+++ b/runtime/onert/core/include/util/ShapeInference.h
@@ -47,7 +47,14 @@ ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank);
 ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape,
                                 const ir::operation::BatchMatMul::Param &param);
 
-ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer);
+ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape,
+                                      const int32_t *cluster_buf);
+
+ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape,
+                              const int32_t *cluster_buf, int rank,
+                              const ir::operation::BCQGather::Param &param);
+
+ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf);
 
 ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat::Param &param);
 
@@ -63,7 +70,7 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha
 
 ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis);
 
-ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buf);
+ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf);
 
 ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &ker_shape);
 
@@ -97,12 +104,12 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp
 ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &input_true_shape,
                            const ir::Shape &input_false_shape);
 
-ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins,
-                          const int32_t *sizes);
+ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf,
+                          const int32_t *sizes_buf);
 
 ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape,
-                                   const ir::Shape &padding_shape, const int32_t *block_shape_data,
-                                   const int32_t *padding_data);
+                                   const ir::Shape &padding_shape, const int32_t *block_shape_buf,
+                                   const int32_t *padding_buf);
 
 ir::Shape inferSplitShape(const ir::Shape input_shape, int axis_value, int num_splits);
 
@@ -132,9 +139,11 @@ StridedSliceParams buildStridedSliceParams(const T *begin, const T *end, const T
 ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSliceParams &op_params,
                                  uint32_t rank);
 
-ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier);
+ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf,
+                         const int32_t multiplier_size);
 
-ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm);
+ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf,
+                              const int32_t rank);
 
 ir::Shape inferUnpackShape(const ir::Shape &input_shape, int axis, int rank);
 
diff --git a/runtime/onert/core/include/util/Utils.h b/runtime/onert/core/include/util/Utils.h
index 847fb6971..8a4eea32b 100644
--- a/runtime/onert/core/include/util/Utils.h
+++ b/runtime/onert/core/include/util/Utils.h
@@ -22,6 +22,87 @@
 #ifndef __ONERT_UTIL_UTILS_H__
 #define __ONERT_UTIL_UTILS_H__
 
+#include "ir/Coordinates.h"
+#include "ir/Shape.h"
+
 #define UNUSED_RELEASE(a) (void)(a)
 
+template <size_t from, size_t to, typename Enable = void> struct ForEachDimension
+{
+  template <typename L, typename... Args>
+  static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords,
+                     L &&lambda_function, Args &&... args)
+  {
+    static_assert(from < to, "from must not be less than to");
+    assert(static_cast<int>(to) <= shape.rank());
+    const auto &d = shape.dim(from);
+
+    for (auto v = 0; v < d; v++)
+    {
+      coords.set(from, v);
+      ForEachDimension<from + 1, to>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                             std::forward<Args>(args)...);
+    }
+  }
+};
+
+template <size_t from, size_t to>
+struct ForEachDimension<from, to, typename std::enable_if<from == to>::type>
+{
+  template <typename L, typename... Args>
+  static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords,
+                     L &&lambda_function, Args &&... args)
+  {
+    UNUSED_RELEASE(shape);
+    assert(static_cast<int>(to) <= shape.rank());
+    lambda_function(coords, std::forward<Args>(args)...);
+  }
+};
+
+template <typename L, typename... Args>
+inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args &&... args)
+{
+  assert(shape.rank() > 0);
+  for (auto i = 0; i < shape.rank(); ++i)
+  {
+    assert(shape.dim(i) > 0);
+  }
+
+  onert::ir::Coordinates coords;
+  switch (shape.rank())
+  {
+    case 0:
+      coords.set(0, 0);
+      ForEachDimension<0, 0>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 1:
+      ForEachDimension<0, 1>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 2:
+      ForEachDimension<0, 2>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 3:
+      ForEachDimension<0, 3>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 4:
+      ForEachDimension<0, 4>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 5:
+      ForEachDimension<0, 5>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    case 6:
+      ForEachDimension<0, 6>::unroll(shape, coords, std::forward<L>(lambda_function),
+                                     std::forward<Args>(args)...);
+      break;
+    default:
+      assert(false && "ShapeLoop, 1 <= Shape'rank <= 6");
+      break;
+  }
+}
 #endif // __ONERT_UTIL_UTILS_H__
diff --git a/runtime/libs/ndarray/src/Array.cpp b/runtime/onert/core/src/backend/IPortableTensor.cc
index f9c9de9d3..cec34e780 100644
--- a/runtime/libs/ndarray/src/Array.cpp
+++ b/runtime/onert/core/src/backend/IPortableTensor.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,16 @@
  * limitations under the License.
  */
 
-#include "ndarray/Array.h"
+#include "backend/IPortableTensor.h"
 
-namespace ndarray
+namespace onert
+{
+namespace backend
 {
 
-template class Array<float>;
-template class Array<int32_t>;
-template class Array<uint32_t>;
-template class Array<uint8_t>;
+// `dynamic_cast` not working across library boundaries on NDK
+// With this as a key function, `dynamic_cast` works across dl
+IPortableTensor::~IPortableTensor() {}
 
-} // namespace ndarray
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.h b/runtime/onert/core/src/backend/controlflow/BackendContext.h
new file mode 100644
index 000000000..d179bfde4
--- /dev/null
+++ b/runtime/onert/core/src/backend/controlflow/BackendContext.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
+#define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
+
+#include <backend/BackendContext.h>
+#include "ExternalContext.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace controlflow
+{
+
+class BackendContext : public onert::backend::BackendContext
+{
+public:
+  BackendContext(const Backend *backend, const ir::Graph *graph,
+                 std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
+                 std::shared_ptr<ITensorBuilder> tensor_builder = nullptr,
+                 std::shared_ptr<IConstantInitializer> constant_initializer = nullptr,
+                 std::shared_ptr<IKernelGenerator> kernel_gen = nullptr,
+                 std::shared_ptr<ITensorRegister> tensor_register = nullptr,
+                 std::shared_ptr<IOptimizer> optimizer = nullptr)
+      : onert::backend::BackendContext(backend, graph, tensor_registry, tensor_builder,
+                                       constant_initializer, kernel_gen, tensor_register,
+                                       optimizer),
+        _external_context(std::make_shared<ExternalContext>())
+  {
+  }
+
+  std::shared_ptr<ExternalContext> external_context() { return _external_context; }
+
+private:
+  // NOTE ruy context has a thread pool, and when multiple ruy contexts are created,
+  //      the thread pool is also created in duplicate
+  // TODO Create one ruy context for session
+  std::shared_ptr<ExternalContext> _external_context;
+};
+
+} // namespace controlflow
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
index 1288e4c96..77f02969d 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
@@ -17,8 +17,7 @@
 #include "DynamicTensorManager.h"
 
 #include "util/logging.h"
-#include "util/Exceptions.h"
-#include "ir/DataType.h"
+#include "misc/polymorphic_downcast.h"
 
 namespace onert
 {
@@ -33,82 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry>
   // DO NOTHING
 }
 
-void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape)
-{
-  // NOTE Handle user tensors first
-  auto user_tensor = _tensors->getNativeUserTensor(ind);
-  if (user_tensor)
-  {
-    // User tensors cannot be reallocated.
-    auto buffer_size = user_tensor->total_size();
-    auto new_size = new_shape.num_elements() * sizeOfDataType(user_tensor->data_type());
-    if (buffer_size < new_size)
-      throw InsufficientBufferSizeException{"Output buffer size is less than output tensor size"};
-    user_tensor->setShape(new_shape);
-    return;
-  }
-
-  // NOTE Then handle own tensors
-  auto tensor = _tensors->getNativeOwnTensor(ind);
-  assert(tensor);
-
-  bool previously_dynamic = tensor->is_dynamic();
-
-  auto allocTensorMem = [&](bool overwrite = false) {
-    auto capacity = tensor->total_size();
-    auto alloc = _dynamic_mem_mgr->allocate(ind, capacity);
-
-    if (overwrite)
-      tensor->overwriteBuffer(alloc);
-    else
-      tensor->setBuffer(alloc);
-  };
-
-  if (!previously_dynamic)
-  {
-    // TODO deallocate tensor->buffer()
-    // issue is that staticTensorManager might have allocate this memory
-    tensor->setShape(new_shape);
-    tensor->set_dynamic();
-    allocTensorMem(true);
-  }
-  else if (tensor->buffer() == nullptr)
-  {
-    tensor->setShape(new_shape);
-    tensor->set_dynamic();
-    allocTensorMem();
-  }
-  // when buffer was already allocated and new_shape requires different size
-  else
-  {
-    auto previous_size = tensor->total_size();
-    auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type());
-    if (previous_size != new_size)
-    {
-      _dynamic_mem_mgr->deallocate(ind);
-
-      tensor->setShape(new_shape);
-      tensor->set_dynamic();
-      allocTensorMem(true);
-    }
-    else
-    { // when buffer with same size was already allocated, shape could differ
-      tensor->setShape(new_shape);
-    }
-  }
-}
-
 void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
                                        const ir::OperandInfo &tensor_info,
                                        ir::Layout backend_layout)
 {
-  auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout, this);
-  _tensors->setNativeOwnTensor(ind, tensor);
+  auto tensor =
+      std::make_unique<cpu_common::Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get());
+  _tensors->setNativeOwnTensor(ind, std::move(tensor));
 }
 
-void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
+void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor)
 {
-  _dealloc_tensor_map[op_ind].emplace(operand_ind);
+  _dealloc_tensor_map[op_ind].emplace(tensor);
 }
 
 void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
@@ -118,25 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
     return;
 
   auto &input_set = find->second;
-  for (auto input_ind : input_set)
+  for (auto *tensor : input_set)
   {
-    if (!_tensors->getNativeTensor(input_ind)->is_dynamic())
+    if (!tensor->is_dynamic())
       continue;
 
-    _dynamic_mem_mgr->deallocate(input_ind);
-    VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value()
+    _dynamic_mem_mgr->deallocate(tensor);
+
+    auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor);
+    cpu_tensor->resetBuffer();
+
+    VERBOSE(DynamicTensorManager) << "Deallocating a tensor " << (void *)tensor
                                   << " (input of op_ind: " << op_ind.value() << ")" << std::endl;
   }
 }
 
-void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
+const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind)
 {
-  if (!_tensors->getNativeTensor(output_ind)->is_dynamic())
-    return;
-
-  _dynamic_mem_mgr->deallocate(output_ind);
-  VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value()
-                                << " (output of a subgraph)" << std::endl;
+  auto ptr = _tensors->getITensor(ind);
+  assert(ptr);
+  return ptr;
 }
 
 } // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
index dbe388ba2..fb822a917 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
+++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
@@ -43,14 +43,16 @@ public:
 
   virtual ~DynamicTensorManager() = default;
 
-  void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override;
-
   void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
                    ir::Layout backend_layout);
 
-  void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override;
+  void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override;
   void deallocInput(ir::OperationIndex op_ind) override;
-  void deallocSubgraphOutput(ir::OperandIndex ind) override;
+
+  std::shared_ptr<cpu_common::DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; }
+
+private:
+  const ITensor *getRawITensor(ir::OperandIndex ind);
 
 private:
   /**
@@ -60,9 +62,10 @@ private:
   std::shared_ptr<cpu_common::DynamicMemoryManager> _dynamic_mem_mgr;
   const std::shared_ptr<TensorRegistry> _tensors;
 
-  // contains list of dynamic tensor index, which can be deallocated after running operation
-  // note: this map could contain static tensor index too. Careful use is required.
-  std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map;
+  // contains list of dynamic tensor, which can be deallocated after running operation
+  // note: this map could contain static tensor too. Careful use is required.
+  std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>>
+      _dealloc_tensor_map;
 };
 
 } // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/ExternalContext.h b/runtime/onert/core/src/backend/controlflow/ExternalContext.h
new file mode 100644
index 000000000..58bccb6c6
--- /dev/null
+++ b/runtime/onert/core/src/backend/controlflow/ExternalContext.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
+#define __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
+
+#include <backend/IExternalContext.h>
+#include <util/ConfigSource.h>
+#include <ruy/context.h>
+
+namespace
+{
+const int kDefaultNumThreadpoolThreads = 1;
+}
+
+namespace onert
+{
+namespace backend
+{
+namespace controlflow
+{
+
+// TODO Unify this with cpu::ExternalContext
+class ExternalContext : public IExternalContext
+{
+public:
+  ExternalContext() : _ruy_context(nullptr)
+  {
+    // setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS));
+  }
+
+  void setMaxNumThreads(int max_num_threads)
+  {
+    const int target_num_threads =
+        max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
+    _ruy_context->set_max_num_threads(target_num_threads);
+  }
+
+  ruy::Context *ruy_context() const { return _ruy_context.get(); }
+
+private:
+  const std::unique_ptr<ruy::Context> _ruy_context;
+};
+
+} // namespace controlflow
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
index de5a6a5f6..d76ca53e3 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
@@ -58,12 +58,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq)
     dyn_ctx->op_seq = &op_seq;
     dyn_ctx->operations = &_graph.operations();
     dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
-    dyn_ctx->tensor_registry = _tensor_reg;
     dyn_ctx->dynamic_tensor_manager = _dyn_tensor_manager;
 
     _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
   }
-  _return_fn_seq->enableDynamicShapeInferer(true);
 
   for (const auto &op_idx : op_seq.operations())
   {
@@ -78,7 +76,7 @@ void KernelGenerator::visit(const ir::operation::If &node)
   const auto then_subg_index = node.param().then_subg_index;
   const auto else_subg_index = node.param().else_subg_index;
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
+  std::vector<backend::ITensor *> input_tensors;
   for (const auto input_index : node.getInputs())
   {
     auto input_tensor = getTensor(input_index);
@@ -86,14 +84,11 @@ void KernelGenerator::visit(const ir::operation::If &node)
     input_tensors.emplace_back(input_tensor);
   }
 
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  exec::DynAllocInfoMap outputs_dyn_alloc_info;
+  std::vector<backend::ITensor *> output_tensors;
   for (const auto output_index : node.getOutputs())
   {
     auto output_tensor = getTensor(output_index);
-
     output_tensors.emplace_back(output_tensor);
-    outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index};
   }
 
   // IfLayer just set ExecutorMap instead of then and else executor to avoid complexity of
@@ -101,8 +96,8 @@ void KernelGenerator::visit(const ir::operation::If &node)
   const auto cond_tensor = input_tensors.front();
   input_tensors.erase(input_tensors.begin());
   auto fn = std::make_unique<::onert::backend::controlflow::kernel::IfLayer>(
-      cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info,
-      then_subg_index, else_subg_index, _executor_map);
+      cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, then_subg_index,
+      else_subg_index, _executor_map);
 
   _return_fn = std::move(fn);
 }
@@ -113,14 +108,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
   const auto input_index{node.getInputs().at(0)};
 
   // Add PermuteLayer
-  std::vector<std::shared_ptr<ITensor>> output_tensors{getTensor(output_index)};
-  std::vector<std::shared_ptr<ITensor>> input_tensors{getTensor(input_index)};
-  std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info;
-  outputs_dyn_alloc_info[output_tensors.at(0)] = exec::DynAllocInfo{output_index};
-
-  auto fn =
-      std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, outputs_dyn_alloc_info);
+  std::vector<ITensor *> output_tensors{getTensor(output_index)};
+  std::vector<ITensor *> input_tensors{getTensor(input_index)};
 
+  auto fn = std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors);
   _return_fn = std::move(fn);
 }
 
@@ -131,7 +122,7 @@ void KernelGenerator::visit(const ir::operation::While &node)
 
   // This op does not support input as a constant, because controlflow backend does not have
   // TensorBuilder
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
+  std::vector<backend::ITensor *> input_tensors;
   for (const auto input_index : node.getInputs())
   {
     auto input_tensor = getTensor(input_index);
@@ -139,29 +130,25 @@ void KernelGenerator::visit(const ir::operation::While &node)
     input_tensors.emplace_back(input_tensor);
   }
 
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info;
+  std::vector<backend::ITensor *> output_tensors;
   for (const auto output_index : node.getOutputs())
   {
     auto output_tensor = getTensor(output_index);
-
     output_tensors.emplace_back(output_tensor);
-
-    outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index};
   }
 
   // WhileLayer just set ExecutorMap instead of cond and body executor to avoid complexity of
   // creating executor recusively
   auto fn = std::make_unique<::onert::backend::controlflow::kernel::WhileLayer>(
-      input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info,
-      cond_subg_index, body_subg_index, _executor_map);
+      input_tensors, output_tensors, node.getOutputs(), _graph, cond_subg_index, body_subg_index,
+      _executor_map);
 
   _return_fn = std::move(fn);
 }
 
-std::shared_ptr<backend::ITensor> KernelGenerator::getTensor(const ir::OperandIndex &index)
+backend::ITensor *KernelGenerator::getTensor(const ir::OperandIndex &index)
 {
-  std::shared_ptr<backend::ITensor> ret = _tensor_registries.getITensor(index);
+  backend::ITensor *ret = _tensor_registries.getITensor(index);
   assert(ret != nullptr);
   return ret;
 }
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h
index b84a810e4..ce248913f 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h
+++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h
@@ -56,7 +56,7 @@ public:
   void visit(const ir::operation::While &) override;
 
 private:
-  std::shared_ptr<backend::ITensor> getTensor(const ir::OperandIndex &index);
+  backend::ITensor *getTensor(const ir::OperandIndex &index);
 
 private:
   const ir::Graph &_graph;
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
index e5c3f5fd5..7d0ff201f 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
@@ -29,8 +29,8 @@ namespace controlflow
 
 TensorBuilder::TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg)
     : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new DynamicTensorManager(_tensor_reg)},
-      _static_tensor_mgr{
-          new cpu_common::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())}
+      _static_tensor_mgr{new cpu_common::StaticTensorManager(
+          _tensor_reg->base_reg(), _dynamic_tensor_mgr->dynamic_mem_mgr().get())}
 {
   /* empty */
 }
@@ -101,25 +101,14 @@ void TensorBuilder::allocate()
   //      This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
 }
 
-std::shared_ptr<cpu_common::Tensor> TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind)
+IDynamicTensorManager *TensorBuilder::dynamicTensorManager(void)
 {
-  return _tensor_reg->getNativeOwnTensor(ind);
-}
-
-std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
-{
-  return std::move(_static_tensor_mgr);
+  return _dynamic_tensor_mgr.get();
 }
 
-std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void)
+cpu_common::Tensor *TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind)
 {
-  return std::move(_dynamic_tensor_mgr);
-}
-
-void TensorBuilder::setNativeUserTensor(const ir::OperandIndex &ind,
-                                        const std::shared_ptr<UserTensor> &tensor)
-{
-  _tensor_reg->setNativeUserTensor(ind, tensor);
+  return _tensor_reg->getNativeOwnTensor(ind);
 }
 
 } // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h
index 2f2a2c47e..695994761 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h
+++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h
@@ -27,7 +27,6 @@
 #include <unordered_map>
 
 #include "DynamicTensorManager.h"
-#include "UserTensorRegistry.h"
 
 namespace onert
 {
@@ -59,20 +58,15 @@ public:
   void allocate() override;
   void postFunctionPrepare() override { /* DO NOTHING */}
 
-  std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override;
-
-  IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); }
-
-  std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override;
+  IDynamicTensorManager *dynamicTensorManager(void) override;
 
   /**
    * @brief Get tensor with a specific OperandIndex.
    * @param ind OperandIndex for the tensor. There must exist a tensor with this ind.
    *        If not, program will crash with assert or exception.
-   * @return shared_ptr<operand::Tensor>
+   * @return operand::Tensor *
    */
-  std::shared_ptr<cpu_common::Tensor> nativeOwnTensorAt(const ir::OperandIndex &ind);
-  void setNativeUserTensor(const ir::OperandIndex &ind, const std::shared_ptr<UserTensor> &tensor);
+  cpu_common::Tensor *nativeOwnTensorAt(const ir::OperandIndex &ind);
 
 private:
   const std::shared_ptr<TensorRegistry> _tensor_reg;
diff --git a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h
index 678c5b73b..94f71bb9c 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h
+++ b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h
@@ -48,7 +48,7 @@ class TensorRegistry : public ITensorRegistry
 public:
   TensorRegistry() : _base_reg{new cpu_common::TensorRegistry} {}
 
-  std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
+  ITensor *getITensor(const ir::OperandIndex &ind) override
   {
     auto base_tensor = _base_reg->getITensor(ind);
     if (base_tensor)
@@ -56,7 +56,7 @@ public:
     return getNativeUserTensor(ind);
   }
 
-  std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
+  ITensor *getNativeITensor(const ir::OperandIndex &ind) override
   {
     auto base_tensor = _base_reg->getNativeITensor(ind);
     if (base_tensor)
@@ -64,7 +64,7 @@ public:
     return getNativeUserTensor(ind);
   }
 
-  std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind)
+  IPortableTensor *getPortableTensor(const ir::OperandIndex &ind)
   {
     auto base_tensor = _base_reg->getPortableTensor(ind);
     if (base_tensor)
@@ -72,7 +72,7 @@ public:
     return getNativeUserTensor(ind);
   }
 
-  std::shared_ptr<IPortableTensor> getNativeTensor(const ir::OperandIndex &ind)
+  IPortableTensor *getNativeTensor(const ir::OperandIndex &ind)
   {
     auto base_tensor = _base_reg->getNativeTensor(ind);
     if (base_tensor)
@@ -80,21 +80,20 @@ public:
     return getNativeUserTensor(ind);
   }
 
-  std::shared_ptr<Tensor> getNativeOwnTensor(const ir::OperandIndex &ind)
+  Tensor *getNativeOwnTensor(const ir::OperandIndex &ind)
   {
     return _base_reg->getNativeTensor(ind);
   }
 
-  std::shared_ptr<UserTensor> getNativeUserTensor(const ir::OperandIndex &ind)
+  UserTensor *getNativeUserTensor(const ir::OperandIndex &ind)
   {
     auto tensor = _native_user_tensors.find(ind);
     if (tensor != _native_user_tensors.end())
-      return tensor->second;
+      return tensor->second.get();
     return nullptr;
   }
 
-  bool setMigrantTensor(const ir::OperandIndex &ind,
-                        const std::shared_ptr<IPortableTensor> &tensor) override
+  bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override
   {
     assert(tensor);
     assert(!getITensor(ind)); // For the ind, tensor is not registered yet
@@ -102,21 +101,21 @@ public:
     return true;
   }
 
-  void setNativeOwnTensor(ir::OperandIndex ind, const std::shared_ptr<Tensor> &tensor)
+  void setNativeOwnTensor(ir::OperandIndex ind, std::unique_ptr<Tensor> &&tensor)
   {
     assert(tensor);
     assert(!getITensor(ind)); // For the ind, tensor is not registered yet
-    _base_reg->setNativeTensor(ind, tensor);
+    _base_reg->setNativeTensor(ind, std::move(tensor));
   }
 
-  void setNativeUserTensor(ir::OperandIndex ind, const std::shared_ptr<UserTensor> &tensor)
+  void setNativeUserTensor(ir::OperandIndex ind, std::unique_ptr<UserTensor> &&tensor)
   {
     assert(tensor);
     assert(!getITensor(ind)); // For the ind, tensor is not registered yet
-    _native_user_tensors[ind] = tensor;
+    _native_user_tensors[ind] = std::move(tensor);
   }
 
-  const ir::OperandIndexMap<std::shared_ptr<UserTensor>> &native_user_tensors()
+  const ir::OperandIndexMap<std::unique_ptr<UserTensor>> &native_user_tensors()
   {
     return _native_user_tensors;
   }
@@ -124,7 +123,7 @@ public:
 
 private:
   std::shared_ptr<cpu_common::TensorRegistry> _base_reg;
-  ir::OperandIndexMap<std::shared_ptr<UserTensor>> _native_user_tensors;
+  ir::OperandIndexMap<std::unique_ptr<UserTensor>> _native_user_tensors;
 };
 
 } // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.cc b/runtime/onert/core/src/backend/controlflow/UserTensor.cc
index c8e2ebade..5081a90ea 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.cc
+++ b/runtime/onert/core/src/backend/controlflow/UserTensor.cc
@@ -16,6 +16,9 @@
 
 #include "UserTensor.h"
 
+#include "util/Exceptions.h"
+#include "ir/DataType.h"
+
 namespace onert
 {
 namespace backend
@@ -35,6 +38,16 @@ size_t UserTensor::calcOffset(const ir::Coordinates &coords) const
   return offset;
 }
 
+bool UserTensor::applyShape(const ir::Shape &new_shape)
+{
+  // User tensors cannot be reallocated.
+  auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type());
+  if (total_size() < new_size)
+    throw InsufficientBufferSizeException{"User given buffer size is too small."};
+  setShape(new_shape);
+  return true;
+}
+
 } // namespace controlflow
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h
index 9be33595d..7aa62a8a9 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h
+++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h
@@ -38,16 +38,12 @@ namespace controlflow
 class UserTensor : public IPortableTensor
 {
 public:
-  UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size,
-             IDynamicTensorManager *dynamic_tensor_manager)
-      : _info{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false},
-        _dynamic_tensor_manager{dynamic_tensor_manager}
+  UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size)
+      : IPortableTensor{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}
   {
   }
 
-  UserTensor(const ir::OperandInfo &info, ir::Layout layout,
-             IDynamicTensorManager *dynamic_tensor_manager)
-      : UserTensor{info, layout, nullptr, 0, dynamic_tensor_manager}
+  UserTensor(const ir::OperandInfo &info, ir::Layout layout) : UserTensor{info, layout, nullptr, 0}
   {
   }
 
@@ -73,15 +69,13 @@ public:
   ir::Shape getShape() const override { return _info.shape(); }
   void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); }
   bool is_constant() const override { return false; }
-  IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; }
+  bool applyShape(const ir::Shape &) override;
 
 private:
-  ir::OperandInfo _info;
   ir::Layout _layout;
   uint8_t *_buffer;
   size_t _size;
   bool _dynamic;
-  IDynamicTensorManager *_dynamic_tensor_manager;
 };
 
 } // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc
index 8377c7183..c0329acd8 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc
+++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc
@@ -30,16 +30,13 @@ namespace controlflow
 namespace kernel
 {
 
-IfLayer::IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor,
-                 const std::vector<std::shared_ptr<backend::ITensor>> input_tensors,
-                 const std::vector<std::shared_ptr<backend::ITensor>> output_tensors,
+IfLayer::IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors,
+                 const std::vector<backend::ITensor *> output_tensors,
                  const ir::OperandIndexSequence &output_indices, const ir::Graph &graph,
-                 const exec::DynAllocInfoMap &outputs_dyn_alloc_info,
                  const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index,
                  exec::ExecutorMap *executor_map)
     : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors},
-      _output_indices{output_indices}, _graph{graph},
-      _outputs_dyn_alloc_info{outputs_dyn_alloc_info}, _then_subg_index{then_subg_index},
+      _output_indices{output_indices}, _graph{graph}, _then_subg_index{then_subg_index},
       _else_subg_index{else_subg_index}, _executor_map{executor_map}
 {
   // At this point, executor_map may not have executors of then subg and else subg
@@ -63,21 +60,24 @@ void IfLayer::run()
   };
 
   exec::ExecutorBase *subg_exec = nullptr;
-  if (getResultCond(_cond_tensor.get()))
+  bool cond_result = getResultCond(_cond_tensor);
+  if (cond_result)
   {
+    VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl;
     subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>(
         _executor_map->at(_then_subg_index).get());
   }
   else
   {
+    VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl;
     subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>(
         _executor_map->at(_else_subg_index).get());
   }
 
   const auto &subg_graph = subg_exec->graph();
 
-  std::vector<std::shared_ptr<backend::ITensor>> src_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> dst_tensors;
+  std::vector<backend::ITensor *> src_tensors;
+  std::vector<backend::ITensor *> dst_tensors;
   // Add tensors used in subgraph or contained in outputs of subgraph
   assert(subg_graph.getInputs().size() == _input_tensors.size());
   assert(subg_graph.getInputs().size() == subg_exec->getInputTensors().size());
@@ -91,9 +91,8 @@ void IfLayer::run()
       dst_tensors.emplace_back(subg_exec->getInputTensors().at(i));
     }
   }
-  const auto &subg_inputs_dyn_alloc_info = subg_exec->getInputsDynamicAllocInfo();
   const auto permute_op_input_to_subg_input =
-      std::make_shared<PermuteLayer>(src_tensors, dst_tensors, subg_inputs_dyn_alloc_info);
+      std::make_shared<PermuteLayer>(src_tensors, dst_tensors);
 
   // Add tensors used as output of operation or contained in outputs of operation
   src_tensors.clear();
@@ -111,7 +110,7 @@ void IfLayer::run()
     }
   }
   const auto permute_subg_output_to_op_output =
-      std::make_shared<PermuteLayer>(src_tensors, dst_tensors, _outputs_dyn_alloc_info);
+      std::make_shared<PermuteLayer>(src_tensors, dst_tensors);
 
   // Remove copying of unused tensor
   permute_op_input_to_subg_input->prepare();
@@ -120,6 +119,8 @@ void IfLayer::run()
   // Copy & run
   subg_exec->execute(_input_tensors, permute_op_input_to_subg_input);
   permute_subg_output_to_op_output->run();
+  VERBOSE(If) << "Return from $" << (cond_result ? _then_subg_index : _else_subg_index)
+              << std::endl;
 }
 
 } // namespace kernel
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h
index ef3a6e6f6..1461388dc 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h
+++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h
@@ -32,11 +32,9 @@ namespace kernel
 class IfLayer : public ::onert::exec::IFunction
 {
 public:
-  IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor,
-          const std::vector<std::shared_ptr<backend::ITensor>> input_tensors,
-          const std::vector<std::shared_ptr<backend::ITensor>> output_tensors,
+  IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors,
+          const std::vector<backend::ITensor *> output_tensors,
           const ir::OperandIndexSequence &output_indices, const ir::Graph &graph,
-          const exec::DynAllocInfoMap &outputs_dyn_alloc_info,
           const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index,
           exec::ExecutorMap *executor_map);
 
@@ -44,12 +42,11 @@ public:
   void run() override;
 
 private:
-  const std::shared_ptr<backend::ITensor> _cond_tensor;
-  const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors;
-  const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors;
+  backend::ITensor *_cond_tensor;
+  const std::vector<backend::ITensor *> _input_tensors;
+  const std::vector<backend::ITensor *> _output_tensors;
   const ir::OperandIndexSequence &_output_indices;
   const ir::Graph &_graph;
-  const exec::DynAllocInfoMap _outputs_dyn_alloc_info;
   const ir::SubgraphIndex _then_subg_index;
   const ir::SubgraphIndex _else_subg_index;
   exec::ExecutorMap *_executor_map;
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc
index e8f1ea679..49fbb33c4 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc
+++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc
@@ -54,12 +54,9 @@ void PermuteLayer::run()
 
       try
       {
-        const auto dst_index = _dst_dyn_alloc_info_map.at(dst_tensor).ind;
-        auto dyn_tensor_manager = dst_tensor->dynamic_tensor_manager();
-        if (!dyn_tensor_manager)
+        if (!dst_tensor->applyShape(new_shape))
           throw std::runtime_error{
               "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};
-        dyn_tensor_manager->applyShape(dst_index, new_shape);
         assert(dst_tensor->buffer() != nullptr);
       }
       catch (const std::out_of_range &e)
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h
index 403ac770d..8129403a5 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h
+++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h
@@ -33,10 +33,7 @@ namespace kernel
 class PermuteLayer : public onert::exec::IPermuteFunction
 {
 public:
-  PermuteLayer(const std::vector<std::shared_ptr<ITensor>> &src_tensors,
-               const std::vector<std::shared_ptr<ITensor>> &dst_tensors,
-               const exec::DynAllocInfoMap &dst_dyn_alloc_info_map)
-      : _dst_dyn_alloc_info_map{dst_dyn_alloc_info_map}
+  PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors)
   {
     assert(src_tensors.size() == dst_tensors.size());
     _src_tensors = src_tensors;
@@ -64,9 +61,6 @@ public:
   }
 
   void run() override;
-
-private:
-  const exec::DynAllocInfoMap _dst_dyn_alloc_info_map;
 };
 
 } // namespace kernel
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc
index 50936e5f6..225f0dd7c 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc
+++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc
@@ -30,16 +30,14 @@ namespace controlflow
 namespace kernel
 {
 
-WhileLayer::WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-                       const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
+WhileLayer::WhileLayer(const std::vector<backend::ITensor *> input_tensors,
+                       const std::vector<backend::ITensor *> output_tensors,
                        const ir::OperandIndexSequence &output_indices, const ir::Graph &graph,
-                       const exec::DynAllocInfoMap &outputs_dyn_alloc_info,
                        const ir::SubgraphIndex &cond_subg_index,
                        const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map)
     : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index},
       _output_indices{output_indices}, _graph{graph}, _input_tensors{input_tensors},
-      _output_tensors{output_tensors}, _outputs_dyn_alloc_info{outputs_dyn_alloc_info},
-      _executor_map{executor_map}
+      _output_tensors{output_tensors}, _executor_map{executor_map}
 {
   // At this point, executor_map may not have executors of cond subg and body subg
 }
@@ -62,15 +60,13 @@ void WhileLayer::run()
       _executor_map->at(_body_subg_index).get());
 
   const auto &cond_graph = cond_exec->graph();
-  const auto &cond_inputs_dyn_alloc = cond_exec->getInputsDynamicAllocInfo();
   const auto &body_graph = body_exec->graph();
-  const auto &body_inputs_dyn_alloc = body_exec->getInputsDynamicAllocInfo();
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> cond_input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> body_input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> body_output_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
+  std::vector<backend::ITensor *> input_tensors;
+  std::vector<backend::ITensor *> cond_input_tensors;
+  std::vector<backend::ITensor *> body_input_tensors;
+  std::vector<backend::ITensor *> body_output_tensors;
+  std::vector<backend::ITensor *> output_tensors;
 
   // Add only used tensors in cond subgraph
   assert(cond_graph.getInputs().size() == _input_tensors.size());
@@ -85,7 +81,7 @@ void WhileLayer::run()
     }
   }
   const auto permute_op_input_to_cond_input =
-      std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors, cond_inputs_dyn_alloc);
+      std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors);
 
   // Add only used tensors among outputs of while operation
   assert(_output_indices.size() == _input_tensors.size());
@@ -103,7 +99,7 @@ void WhileLayer::run()
     }
   }
   const auto permute_op_input_to_op_output =
-      std::make_shared<PermuteLayer>(input_tensors, output_tensors, _outputs_dyn_alloc_info);
+      std::make_shared<PermuteLayer>(input_tensors, output_tensors);
 
   // Add all tensors with unused tensors in body subgraph because unused input tensors will be
   // copied output tensors in body subgraph
@@ -111,7 +107,7 @@ void WhileLayer::run()
   input_tensors = _input_tensors;
   body_input_tensors = body_exec->getInputTensors();
   const auto permute_op_input_to_body_input =
-      std::make_shared<PermuteLayer>(input_tensors, body_input_tensors, body_inputs_dyn_alloc);
+      std::make_shared<PermuteLayer>(input_tensors, body_input_tensors);
 
   // Add only used tensors in cond subgraph
   assert(cond_graph.getInputs().size() == body_exec->getOutputTensors().size());
@@ -127,8 +123,8 @@ void WhileLayer::run()
       cond_input_tensors.emplace_back(cond_exec->getInputTensors().at(i));
     }
   }
-  const auto permute_body_output_to_cond_input = std::make_shared<PermuteLayer>(
-      body_output_tensors, cond_input_tensors, cond_inputs_dyn_alloc);
+  const auto permute_body_output_to_cond_input =
+      std::make_shared<PermuteLayer>(body_output_tensors, cond_input_tensors);
 
   // Add only used tensors in body subgraph
   assert(body_graph.getInputs().size() == body_exec->getOutputTensors().size());
@@ -146,8 +142,8 @@ void WhileLayer::run()
       body_input_tensors.emplace_back(body_exec->getInputTensors().at(i));
     }
   }
-  const auto permute_body_output_to_body_input = std::make_shared<PermuteLayer>(
-      body_output_tensors, body_input_tensors, body_inputs_dyn_alloc);
+  const auto permute_body_output_to_body_input =
+      std::make_shared<PermuteLayer>(body_output_tensors, body_input_tensors);
 
   // Add only used tensors among outputs of while operation
   assert(_output_indices.size() == body_exec->getOutputTensors().size());
@@ -165,7 +161,7 @@ void WhileLayer::run()
     }
   }
   const auto permute_body_output_to_op_output =
-      std::make_shared<PermuteLayer>(body_output_tensors, output_tensors, _outputs_dyn_alloc_info);
+      std::make_shared<PermuteLayer>(body_output_tensors, output_tensors);
 
   // Remove copying of unused tensor
   permute_op_input_to_cond_input->prepare();
@@ -175,7 +171,9 @@ void WhileLayer::run()
   permute_body_output_to_body_input->prepare();
   permute_body_output_to_op_output->prepare();
 
+  VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl;
   cond_exec->execute(_input_tensors, permute_op_input_to_cond_input);
+  VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl;
 
   assert(cond_exec->getOutputTensors().size() == 1);
   auto &cond_output_tensor = cond_exec->getOutputTensors().at(0);
@@ -186,21 +184,27 @@ void WhileLayer::run()
   };
 
   const auto body_execute_with_op_inputs = [&]() {
+    VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl;
     body_exec->execute(_input_tensors, permute_op_input_to_body_input);
+    VERBOSE(While) << "Return from $" << _body_subg_index << std::endl;
   };
 
   const auto body_execute_with_body_outputs = [&]() {
+    VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl;
     body_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_body_input);
+    VERBOSE(While) << "Return from $" << _body_subg_index << std::endl;
   };
 
   std::function<void()> body_execute = body_execute_with_op_inputs;
   const auto cond_execute = [&]() {
+    VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl;
     cond_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_cond_input);
+    VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl;
   };
   auto permute_to_outputs_fn = permute_op_input_to_op_output;
 
   // Loop while Cond subgraph's output is true
-  while (getResultCond(cond_output_tensor.get()))
+  while (getResultCond(cond_output_tensor))
   {
     body_execute();
     cond_execute();
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h
index ebca8acdc..9dae49281 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h
+++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h
@@ -35,10 +35,9 @@ namespace kernel
 class WhileLayer : public ::onert::exec::IFunction
 {
 public:
-  WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-             const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
+  WhileLayer(const std::vector<backend::ITensor *> input_tensors,
+             const std::vector<backend::ITensor *> output_tensors,
              const ir::OperandIndexSequence &output_indices, const ir::Graph &graph,
-             const exec::DynAllocInfoMap &outputs_dyn_alloc_info,
              const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index,
              exec::ExecutorMap *executor_map);
 
@@ -50,9 +49,8 @@ private:
   const ir::SubgraphIndex _body_subg_index;
   const ir::OperandIndexSequence &_output_indices;
   const ir::Graph &_graph;
-  const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors;
-  const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors;
-  const exec::DynAllocInfoMap _outputs_dyn_alloc_info;
+  const std::vector<backend::ITensor *> _input_tensors;
+  const std::vector<backend::ITensor *> _output_tensors;
   exec::ExecutorMap *_executor_map;
 };
 
diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
index f7ce3d011..740248ccd 100644
--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
@@ -17,6 +17,7 @@
 #include "backend/cpu_common/DynamicTensorManager.h"
 
 #include "util/logging.h"
+#include "misc/polymorphic_downcast.h"
 
 namespace onert
 {
@@ -31,71 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry>
   // DO NOTHING
 }
 
-void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape)
-{
-  VERBOSE_F() << ind << std::endl;
-
-  auto tensor = _tensors->getNativeTensor(ind);
-  assert(tensor);
-
-  bool previously_dynamic = tensor->is_dynamic();
-
-  auto allocTensorMem = [&](bool overwrite = false) {
-    auto capacity = tensor->total_size();
-    auto alloc = _dynamic_mem_mgr->allocate(ind, capacity);
-
-    if (overwrite)
-      tensor->overwriteBuffer(alloc);
-    else
-      tensor->setBuffer(alloc);
-  };
-
-  if (!previously_dynamic)
-  {
-    // TODO deallocate tensor->buffer()
-    // issue is that staticTensorManager might have allocate this memory
-    tensor->setShape(new_shape);
-    tensor->set_dynamic();
-    allocTensorMem(true);
-  }
-  else if (tensor->buffer() == nullptr)
-  {
-    tensor->setShape(new_shape);
-    tensor->set_dynamic();
-    allocTensorMem();
-  }
-  // when buffer was already allocated and new_shape requires different size
-  else
-  {
-    auto previous_size = tensor->total_size();
-    auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type());
-    if (previous_size != new_size)
-    {
-      _dynamic_mem_mgr->deallocate(ind);
-
-      tensor->setShape(new_shape);
-      tensor->set_dynamic();
-      allocTensorMem(true);
-    }
-    else
-    { // when buffer with same size was already allocated, shape could differ
-      tensor->setShape(new_shape);
-    }
-  }
-}
-
 void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
                                        const ir::OperandInfo &tensor_info,
                                        ir::Layout backend_layout)
 {
   assert(_tensors->getNativeTensor(ind) == nullptr);
-  auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, this);
-  _tensors->setNativeTensor(ind, tensor);
+  auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get());
+  _tensors->setNativeTensor(ind, std::move(tensor));
 }
 
-void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
+void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor)
 {
-  _dealloc_tensor_map[op_ind].emplace(operand_ind);
+  _dealloc_tensor_map[op_ind].emplace(tensor);
 }
 
 void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
@@ -105,31 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
     return;
 
   auto &input_set = find->second;
-  for (auto input_ind : input_set)
+  for (auto *tensor : input_set)
   {
-    auto *tensor = _tensors->getNativeTensor(input_ind).get();
     if (!tensor->is_dynamic())
       continue;
 
-    _dynamic_mem_mgr->deallocate(input_ind);
-    tensor->resetBuffer();
+    _dynamic_mem_mgr->deallocate(tensor);
 
-    VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value()
+    auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor);
+    cpu_tensor->resetBuffer();
+
+    VERBOSE(DynamicTensorManager) << "Deallocating tensor " << (void *)cpu_tensor
                                   << " (input of op_ind: " << op_ind.value() << ")" << std::endl;
   }
 }
 
-void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
+const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind)
 {
-  auto *tensor = _tensors->getNativeTensor(output_ind).get();
-  if (!tensor->is_dynamic())
-    return;
-
-  _dynamic_mem_mgr->deallocate(output_ind);
-  tensor->resetBuffer();
-
-  VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value()
-                                << " (output of a subgraph)" << std::endl;
+  auto ptr = _tensors->getITensor(ind);
+  assert(ptr);
+  return ptr;
 }
 
 } // namespace cpu_common
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc
index 8cb9c22ca..9f179d9ee 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc
@@ -20,6 +20,7 @@
 
 #include "MemoryPlannerFactory.h"
 #include "util/ConfigSource.h"
+#include "util/logging.h"
 
 namespace onert
 {
@@ -70,20 +71,20 @@ uint8_t *MemoryManager::getBuffer(const ir::OperandIndex &ind) const
   return _mem_alloc->base() + mem_blk.offset;
 }
 
-std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ir::OperandIndex &ind,
+std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ITensor *tensor,
                                                                       uint32_t capacity)
 {
-  auto find = _mem_alloc_map.find(ind);
+  auto find = _mem_alloc_map.find(tensor);
   if (find != _mem_alloc_map.end())
     throw std::runtime_error("Cannot allocate memory for a tensor. It was already allocated.");
 
-  _mem_alloc_map[ind] = std::make_shared<cpu_common::Allocator>(capacity);
-  return _mem_alloc_map[ind];
+  _mem_alloc_map[tensor] = std::make_shared<cpu_common::Allocator>(capacity);
+  return _mem_alloc_map[tensor];
 }
 
-void DynamicMemoryManager::deallocate(const ir::OperandIndex &ind)
+void DynamicMemoryManager::deallocate(const ITensor *tensor)
 {
-  auto find = _mem_alloc_map.find(ind);
+  auto find = _mem_alloc_map.find(tensor);
   if (find == _mem_alloc_map.end())
     throw std::runtime_error("Cannot find Allocator for the requested index");
 
diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
index 440f70c93..cac43babe 100644
--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
@@ -27,9 +27,9 @@ namespace cpu_common
 {
 
 StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg,
-                                         IDynamicTensorManager *dynamic_tensor_manager)
+                                         DynamicMemoryManager *dynamic_mem_mgr)
     : _const_mgr{new DynamicMemoryManager()}, _nonconst_mgr{new MemoryManager()}, _tensors{reg},
-      _dynamic_tensor_manager{dynamic_tensor_manager}
+      _dynamic_mem_mgr{dynamic_mem_mgr}
 {
   // DO NOTHING
 }
@@ -39,10 +39,10 @@ void StaticTensorManager::allocateConsts(void)
   for (auto &pair : _tensors->native_tensors())
   {
     const auto &ind = pair.first;
-    auto tensor = pair.second;
+    auto tensor = pair.second.get();
     if (_as_constants[ind])
     {
-      auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size());
+      auto mem_alloc = _const_mgr->allocate(_tensors->getITensor(ind), tensor->total_size());
       tensor->setBuffer(mem_alloc);
       auto buffer = mem_alloc->base();
       VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
@@ -59,7 +59,7 @@ void StaticTensorManager::allocateNonconsts(void)
   for (auto &pair : _tensors->native_tensors())
   {
     const auto &ind = pair.first;
-    auto tensor = pair.second;
+    auto tensor = pair.second.get();
     if (!_as_constants[ind] && !tensor->is_dynamic())
     {
       auto *buffer = _nonconst_mgr->getBuffer(ind);
@@ -80,8 +80,8 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
                                       bool as_const)
 {
   assert(!_tensors->getNativeTensor(ind));
-  auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager);
-  _tensors->setNativeTensor(ind, tensor);
+  auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr);
+  _tensors->setNativeTensor(ind, std::move(tensor));
   _as_constants[ind] = as_const;
 }
 
diff --git a/runtime/onert/core/src/backend/cpu_common/Tensor.cc b/runtime/onert/core/src/backend/cpu_common/Tensor.cc
index f34564dd9..d3dcf9a6d 100644
--- a/runtime/onert/core/src/backend/cpu_common/Tensor.cc
+++ b/runtime/onert/core/src/backend/cpu_common/Tensor.cc
@@ -16,6 +16,9 @@
 
 #include "backend/cpu_common/Tensor.h"
 
+#include "ir/DataType.h"
+#include "backend/cpu_common/MemoryManager.h"
+
 namespace onert
 {
 namespace backend
@@ -23,6 +26,8 @@ namespace backend
 namespace cpu_common
 {
 
+Tensor::~Tensor() {}
+
 size_t Tensor::calcOffset(const ir::Coordinates &coords) const
 {
   size_t rank = num_dimensions();
@@ -38,6 +43,55 @@ size_t Tensor::calcOffset(const ir::Coordinates &coords) const
 
 void Tensor::setShape(const ir::Shape &new_shape) { _info.shape(new_shape); }
 
+bool Tensor::applyShape(const ir::Shape &new_shape)
+{
+  bool previously_dynamic = is_dynamic();
+
+  auto allocTensorMem = [&](bool overwrite = false) {
+    auto capacity = total_size();
+    auto alloc = _dynamic_mem_mgr->allocate(this, capacity);
+
+    if (overwrite)
+      overwriteBuffer(alloc);
+    else
+      setBuffer(alloc);
+  };
+
+  if (!previously_dynamic)
+  {
+    // TODO deallocate tensor->buffer()
+    // issue is that staticTensorManager might have allocate this memory
+    setShape(new_shape);
+    set_dynamic();
+    allocTensorMem(true);
+  }
+  else if (buffer() == nullptr)
+  {
+    setShape(new_shape);
+    set_dynamic();
+    allocTensorMem();
+  }
+  // when buffer was already allocated and new_shape requires different size
+  else
+  {
+    auto previous_size = total_size();
+    auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type());
+    if (previous_size != new_size)
+    {
+      _dynamic_mem_mgr->deallocate(this);
+
+      setShape(new_shape);
+      set_dynamic();
+      allocTensorMem(true);
+    }
+    else
+    { // when buffer with same size was already allocated, shape could differ
+      setShape(new_shape);
+    }
+  }
+  return true;
+}
+
 } // namespace cpu_common
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/core/src/compiler/BackendManager.cc b/runtime/onert/core/src/compiler/BackendManager.cc
index db7a14a96..0093f50fd 100644
--- a/runtime/onert/core/src/compiler/BackendManager.cc
+++ b/runtime/onert/core/src/compiler/BackendManager.cc
@@ -70,31 +70,18 @@ void BackendManager::loadBackend(const std::string &backend)
   }
 
   // TODO Remove indentation
-  // Workaround If backend have dynamic library with "-boost" suffix naming,
-  //            BackendManager load library with "-boost" suffix instead of library without suffix
-  //            This feature is used for custom backend extension to support additional operations
   {
-    const std::string backend_boost_so = "libbackend_" + backend + "-boost" + SHARED_LIB_EXT;
     const std::string backend_so = "libbackend_" + backend + SHARED_LIB_EXT;
+    void *handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL);
 
-    void *handle = dlopen(backend_boost_so.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (handle == nullptr)
     {
-      handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL);
-
-      if (handle == nullptr)
-      {
-        VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl;
-        return;
-      }
-
-      VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n";
-    }
-    else
-    {
-      VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_boost_so << "\n";
+      VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl;
+      return;
     }
 
+    VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n";
+
     {
       // load object creator function
       auto backend_create = (backend_create_t)dlsym(handle, "onert_backend_create");
diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc
index 93dbbc3b5..12b582b35 100644
--- a/runtime/onert/core/src/compiler/Compiler.cc
+++ b/runtime/onert/core/src/compiler/Compiler.cc
@@ -19,6 +19,7 @@
 #include "ParamChecker.h"
 #include "ExecutorFactory.h"
 #include "OperationValidator.h"
+#include "ShapeValidator.h"
 #include "Fp32ToFp16Converter.h"
 
 #include <backend/controlflow/Config.h>
@@ -27,8 +28,12 @@
 #include "compiler/ManualScheduler.h"
 #include "compiler/HEScheduler.h"
 #include "compiler/StaticShapeInference.h"
+#include "compiler/pass/ConstantOutputPass.h"
+#include "compiler/pass/OddOutputPass.h"
+#include "compiler/pass/PassRunner.h"
 #include "exec/ExecTime.h"
 #include "ir/operation/LowerInfo.h"
+#include "ir/verifier/Verifier.h"
 #include "dumper/dot/DotDumper.h"
 #include "compiler/Linear.h"
 #include "interp/InterpExecutor.h"
@@ -132,6 +137,8 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
         backend::controlflow::Config::ID;
     _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] =
         backend::controlflow::Config::ID;
+    _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] =
+        backend::controlflow::Config::ID;
   }
 
   // FIXME This is a workaround for bcq operations, should remove it
@@ -159,10 +166,24 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
     VERBOSE(Compiler) << std::noboolalpha;
   }
 
+  _subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) {
+    // Mandatory passes
+    pass::PassRunner{}
+        .append(std::make_unique<pass::ConstantOutputPass>(subg))
+        .append(std::make_unique<pass::OddOutputPass>(subg))
+        .run();
+  });
+
   /***************************************************
    * Prepare compilation phase
    ***************************************************/
 
+  // Check shape independent operation feature
+  // - Operand type
+  // - Shape independent parameter
+  _subgraphs->iterate(
+      [](const onert::ir::SubgraphIndex &, const ir::Graph &subg) { OperationValidator{subg}(); });
+
   auto executors = std::make_shared<exec::ExecutorMap>();
 
   // Compilable check
@@ -229,17 +250,23 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
     inferer.dump();
   }
 
-  /*************************************************************
-   *  Backend independent analysis & optimization phase finished
-   *************************************************************/
-
-  // operation validation
+  // Shape validation
+  // TODO Move shape independent feature check from ShapeValidator to OperationValidator
+  // TODO Move ShapeValidator into shape inference
+  //      - Check input tensor shape validation
+  //      - Check parameter value validation which valid value is depend on input tensor shape
+  //      - Output tensor shape validation check is needless because
+  //        static/dynamic shape inferer will make valid output shape
   for (auto &pair : lowered_subgs)
   {
     auto &lowered_subg = pair.second;
-    compiler::OperationValidator{lowered_subg->graph()}();
+    compiler::ShapeValidator{lowered_subg->graph()}();
   }
 
+  /*************************************************************
+   *  Backend independent analysis & optimization phase finished
+   *************************************************************/
+
   executors = std::make_shared<exec::ExecutorMap>();
   for (auto &pair : lowered_subgs)
   {
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index 062c6c9c3..bb325ffbc 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -29,6 +29,7 @@
 #include "backend/IConstantInitializer.h"
 #include "backend/IKernelGenerator.h"
 #include "backend/IOptimizer.h"
+#include "backend/IPortableTensor.h"
 #include "backend/ITensorRegister.h"
 #include "backend/controlflow/Config.h"
 #include "backend/controlflow/KernelGenerator.h"
@@ -65,23 +66,6 @@ private:
   std::shared_ptr<backend::IConfig> _config;
 };
 
-// TODO Think of a better way to manage TensorManagers
-backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
-{
-  backend::TensorManagerSet tensor_mgrs;
-  for (auto &tensor_builder : tensor_builders)
-  {
-    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
-    if (s_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(s_tensor_manager));
-
-    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
-    if (d_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(d_tensor_manager));
-  }
-  return tensor_mgrs;
-}
-
 } // namespace
 } // namespace onert
 
@@ -172,7 +156,8 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap
       for (const auto op_idx : op_seq)
       {
         const auto &op = lowered_graph->graph().operations().at(op_idx);
-        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
+        for (const auto &index :
+             (op.getInputs() | ir::Remove::UNDEFINED) + (op.getOutputs() | ir::Remove::UNDEFINED))
         {
           if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
           {
@@ -200,11 +185,11 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap
   }
 }
 
-std::vector<std::shared_ptr<backend::ITensor>>
+std::vector<backend::ITensor *>
 ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                                           const ir::OperandIndexSequence &indices)
 {
-  std::vector<std::shared_ptr<backend::ITensor>> ret;
+  std::vector<backend::ITensor *> ret;
 
   // TODO Store controlflow backend in BackendContext
   std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
@@ -227,19 +212,20 @@ ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
   for (auto ind : indices)
   {
     const auto &operand = lowered_graph.graph().operands().at(ind);
-    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
+    auto tensor = std::make_unique<backend::controlflow::UserTensor>(
         operand.info(),
-        ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
-        cf_tensor_builder->dynamicTensorManager());
+        ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */
+        );
 
     // Add tensor to controlflow TensorRegistry.
-    cf_tensor_reg->setNativeUserTensor(ind, tensor);
-    ret.push_back(tensor);
+    cf_tensor_reg->setNativeUserTensor(ind, std::move(tensor));
+    auto *itensor = cf_tensor_reg->getITensor(ind);
+    ret.push_back(itensor);
   }
   return ret;
 }
 
-void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
+void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph)
 {
   TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};
 
@@ -251,13 +237,13 @@ void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_gra
                             ir::Remove::UNDEFINED)
         {
           // If an OpSequence input/output tensor does not have a own tensor object,
-          // it must be using external tensors, so find the tensor from other tensor builders and
+          // it must be using migrant tensors, so find the tensor from other tensor builders and
           // set the tensor to this tensor builder if portable
           if (!backend_ctx->tensor_registry->getITensor(ind))
           {
             auto tensor = tensor_regs.getITensor(ind);
             assert(tensor); // The tensor must have been registered
-            auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
+            auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
             if (ptensor)
               backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
           }
@@ -299,8 +285,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
   auto order = Linear::linearize(*lowered_graph);
   runTensorRegistration(lowered_graph.get(), order);
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
+  std::vector<backend::ITensor *> input_tensors;
+  std::vector<backend::ITensor *> output_tensors;
   if (options.is_primary_subgraph)
   {
     input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
@@ -318,7 +304,7 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
     tensor_builder->prepare();
   }
 
-  prepareExternalTensors(*lowered_graph);
+  prepareMigrantTensors(*lowered_graph);
 
   ExecutionBuilder builder;
 
@@ -370,10 +356,9 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
     });
   }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-  auto exec = new exec::LinearExecutor{
-      std::move(lowered_graph), input_tensors,       output_tensors, tensor_regs,
-      std::move(tensor_mgrs),   std::move(code_map), order};
+  auto exec =
+      new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
+                               std::move(code_map),      order};
 
   if (!options.trace_filepath.empty())
   {
@@ -396,8 +381,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
   auto order = Linear::linearize(*lowered_graph);
   runTensorRegistration(lowered_graph.get(), order);
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
+  std::vector<backend::ITensor *> input_tensors;
+  std::vector<backend::ITensor *> output_tensors;
   if (options.is_primary_subgraph)
   {
     input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
@@ -424,7 +409,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
     tensor_builder->prepare();
   }
 
-  prepareExternalTensors(*lowered_graph);
+  prepareMigrantTensors(*lowered_graph);
 
   ExecutionBuilder builder;
 
@@ -477,20 +462,16 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
     });
   }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-
   exec::ExecutorBase *exec = nullptr;
   if (parallel)
   {
-    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
-                                      output_tensors,           tensor_regs,
-                                      std::move(tensor_mgrs),   std::move(code_map)};
+    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors,
+                                      tensor_regs, std::move(code_map)};
   }
   else
   {
-    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
-                                                    output_tensors,           tensor_regs,
-                                                    std::move(tensor_mgrs),   std::move(code_map)};
+    auto dataflow_exec = new exec::DataflowExecutor{
+        std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, std::move(code_map)};
     if (options.he_profiling_mode)
     {
       std::vector<const backend::Backend *> backends;
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h
index b8893c03b..e76b721ea 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.h
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.h
@@ -46,10 +46,10 @@ private:
   static void initializeBackendContext(compiler::LoweredGraph *lowered_graph);
   static void runTensorRegistration(compiler::LoweredGraph *lowered_graph,
                                     const std::vector<ir::OpSequenceIndex> &order);
-  static std::vector<std::shared_ptr<backend::ITensor>>
+  static std::vector<backend::ITensor *>
   initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                            const ir::OperandIndexSequence &indices);
-  static void prepareExternalTensors(compiler::LoweredGraph &lowered_graph);
+  static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph);
   static exec::IExecutor *
   createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                        const compiler::CompilerOptions &options,
diff --git a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc
index 5653b090e..fe54b0fdd 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.cc
+++ b/runtime/onert/core/src/compiler/HEScheduler.cc
@@ -34,7 +34,8 @@ namespace compiler
 static uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::Operation &node)
 {
   uint32_t size = 0;
-  for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs())
+  for (const auto &ind :
+       (node.getInputs() | ir::Remove::UNDEFINED) + (node.getOutputs() | ir::Remove::UNDEFINED))
   {
     size += graph.operands().at(ind).info().total_size();
   }
@@ -248,8 +249,9 @@ int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
   if (time != _exec_time->NOT_FOUND)
     return time;
 
+  // FIXME permute time is not recorded so the control reaches here always
   // Makes the scheduler prefer keeping computations on one backend
-  return size / 200;
+  return size / 400;
 }
 
 int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backend *backend)
@@ -370,7 +372,7 @@ int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
 {
   const auto &node = _graph->operations().at(index);
   int64_t max_child_rank = 0;
-  for (const auto &output : node.getOutputs())
+  for (const auto &output : node.getOutputs() | ir::Remove::UNDEFINED)
   {
     const auto &operand = _graph->operands().at(output);
     const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc
index 49a989500..39e58fe11 100644
--- a/runtime/onert/core/src/compiler/Linear.cc
+++ b/runtime/onert/core/src/compiler/Linear.cc
@@ -148,6 +148,9 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph,
     tensor_builder->notifyFirstUse(ind);
   }
 
+  const auto io_tensors =
+      (graph.getInputs() + graph.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
   // At each operation,
   // 1. Scan DEF of outputs. If the DEF, allocate it
   // 2. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
@@ -182,7 +185,15 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph,
           // plan for deallocation of dynamic tensor
           auto dyn_tensor_manager = tensor_builder_map[ind]->dynamicTensorManager();
           if (dyn_tensor_manager)
-            dyn_tensor_manager->planDealloc(op_idx, ind);
+          {
+            const auto *backend =
+                lowered_graph.getLowerInfo(ind)->def_factors().getOnlyElement().backend();
+            auto &tensor_registry = lowered_graph.backend_contexts().at(backend)->tensor_registry;
+            auto *tensor = tensor_registry->getITensor(ind);
+            assert(tensor);
+            if (!io_tensors.contains(ind)) // I/O tensors cannot be deallocated
+              dyn_tensor_manager->planDealloc(op_idx, tensor);
+          }
         }
       }
     }
diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc
index 1489a1884..cdf1a8158 100644
--- a/runtime/onert/core/src/compiler/LoweredGraph.cc
+++ b/runtime/onert/core/src/compiler/LoweredGraph.cc
@@ -21,6 +21,7 @@
 #include "util/logging.h"
 #include "compiler/pass/ConstantInsertionPass.h"
 #include "compiler/pass/ConstantLoweringPass.h"
+#include "compiler/pass/PassRunner.h"
 #include "compiler/pass/PermutationOperationPass.h"
 #include "compiler/pass/PermutationInsertionPass.h"
 #include "compiler/pass/PermutationEliminationPass.h"
@@ -101,14 +102,14 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
       std::reverse(std::begin(op_seq.operations()), std::end(op_seq.operations()));
     });
 
-    VERBOSE(OpSequences) << "dump without permutation" << std::endl;
+    VERBOSE(OpSequences) << "dump before permutation insertion" << std::endl;
     dumpOpSequences(_op_seqs, _graph.operations());
 
-    pass::ConstantInsertionPass ci_pass(*this);
-    ci_pass.run();
-
-    pass::ConstantLoweringPass cl_pass(*this);
-    cl_pass.run();
+    // Mandatory passes
+    pass::PassRunner{}
+        .append(std::make_unique<pass::ConstantInsertionPass>(*this))
+        .append(std::make_unique<pass::ConstantLoweringPass>(*this))
+        .run();
 
     // Set LowerInfo for each operand from the operand::LowerInfo holder
     manipulateLowerInfo(operands_lower_info, options.is_primary_subgraph);
@@ -116,20 +117,17 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
     dumpLowerInfo();
   }
 
-  // Run Permutation Passes
-  {
-    pass::PermutationOperationPass po_pass(*this);
-    po_pass.run();
-
-    pass::PermutationInsertionPass pi_pass(*this);
-    pi_pass.run();
+  // Mandatory passes
+  pass::PassRunner{}
+      .append(std::make_unique<pass::PermutationOperationPass>(*this))
+      .append(std::make_unique<pass::PermutationInsertionPass>(*this))
+      .run();
 
-    pass::PermutationEliminationPass pe_pass(*this);
-    pe_pass.run();
+  // Optimization passes
+  pass::PassRunner{}.append(std::make_unique<pass::PermutationEliminationPass>(*this)).run();
 
-    VERBOSE(OpSequences) << "dump with permutation" << std::endl;
-    dumpOpSequences(_op_seqs, _graph.operations());
-  }
+  VERBOSE(OpSequences) << "Dump after permutation insertion" << std::endl;
+  dumpOpSequences(_op_seqs, _graph.operations());
 
   // Graph verifications
   {
@@ -276,7 +274,7 @@ void LoweredGraph::makeOpSequences(
           auto &&lower_info = operands_lower_info.at(operand);
           lower_info->addUsePermuteFactor(ir::operand::PermuteFactor{backend, backend_layout});
         }
-        for (auto operand : node.getOutputs())
+        for (auto operand : node.getOutputs() | ir::Remove::UNDEFINED)
         {
           auto &&lower_info = operands_lower_info.at(operand);
           lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{backend, backend_layout});
@@ -340,7 +338,7 @@ void LoweredGraph::manipulateLowerInfo(
       assert(lower_info->def_factors().empty());
       lower_info->addDefPermuteFactor(factor);
     }
-    for (auto index : _graph.getOutputs())
+    for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED)
     {
       auto &&lower_info = operands_lower_info.at(index);
       lower_info->addUsePermuteFactor(factor);
@@ -368,7 +366,7 @@ void LoweredGraph::manipulateLowerInfo(
       }
     }
   }
-  for (auto index : _graph.getOutputs())
+  for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED)
   {
     auto &&lower_info = operands_lower_info.at(index);
     if (lower_info->def_factors().size() == 0)
@@ -496,7 +494,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index,
     branched_set.clear();
 
     // Check for branching down
-    for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED)
+    for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
     {
       // TODO Fix this workaround for the case of model outputs that are used by another operation
       //      This is needed since the branching is decided by operation, but for model outputs,
@@ -544,7 +542,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index,
       }
 
       // node's input == op_seq's output?
-      for (const auto output : n.getOutputs())
+      for (const auto output : n.getOutputs() | ir::Remove::UNDEFINED)
       {
         if (node_inputs.contains(output))
         {
diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc
index f7f659e3e..0582cf154 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.cc
+++ b/runtime/onert/core/src/compiler/OperationValidator.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,7 @@
 
 #include "OperationValidator.h"
 
-#include <typeinfo>
-
 #include "ir/Graph.h"
-#include "ir/operation/LowerInfo.h"
-
-#include "util/logging.h"
-#include "util/Utils.h"
 
 #define OP_REQUIRES(EXP)                                                                         \
   do                                                                                             \
@@ -37,33 +31,14 @@ namespace compiler
 {
 
 OperationValidator::OperationValidator(const ir::Graph &graph)
-    : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN}
+    : _graph{graph}, _ctx{graph.operands()}
 {
 }
 
-void OperationValidator::checkUnaryOp(const ir::Operation &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(0)};
-
-  // Check if I/O types match
-  OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  // Check if I/O shapes match
-  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
-
 void OperationValidator::operator()()
 {
-  // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when
-  // creating Compiler
   assert(_graph.subgraphs() == nullptr);
 
-  _current_op_seq_layout = _graph.layout();
-
   _graph.operations().iterate(
       [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
 }
@@ -72,50 +47,23 @@ void OperationValidator::visit(const ir::operation::BatchMatMul &node)
 {
   const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS));
   const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS));
-  const auto out_index{node.getOutputs().at(0)};
 
   // Constant lhs and rhs is not implemented yet
   OP_REQUIRES(!_ctx.at(lhs_index).isConstant() && !_ctx.at(rhs_index).isConstant());
-
-  if (_ctx.at(out_index).info().isDynamic())
-    return;
-
-  OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4);
-  OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4);
-  OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2);
-  OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2);
 }
 
 void OperationValidator::visit(const ir::operation::BatchToSpaceND &node)
 {
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
   const auto block_size_index{
       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
 
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
-  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
-
-  // All requirement as per NNAPI specification.
-  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1);
-
-  OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2);
-
+  // Non-constant block_size is not implemented yet
   OP_REQUIRES(_ctx.at(block_size_index).isConstant());
-
-  OP_REQUIRES(input_shape.C == output_shape.C);
 }
 
 void OperationValidator::visit(const ir::operation::Comparison &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  // This validator does not check shape. So checking isDynamic() is skipped.
 
   const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
   const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
@@ -124,223 +72,20 @@ void OperationValidator::visit(const ir::operation::Comparison &node)
   OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::BOOL8);
 }
 
-void OperationValidator::visit(const ir::operation::Softmax &node)
-{
-  VERBOSE(Softmax) << "Configure SOFTMAX operation" << std::endl;
-
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(0)};
-
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
-}
-
-void OperationValidator::visit(const ir::operation::InstanceNorm &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
-  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
-  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
-
-  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape());
-  OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1);
-  OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1);
-}
-
-void OperationValidator::visit(const ir::operation::Pool2D &node)
+void OperationValidator::visit(const ir::operation::DepthToSpace &node)
 {
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)};
+  int32_t block_size = node.param().block_size;
 
-  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+  OP_REQUIRES(block_size > 0);
 }
 
-void OperationValidator::visit(const ir::operation::Permute &node)
+void OperationValidator::visit(const ir::operation::ElementwiseActivation &node)
 {
-  VERBOSE(Permute) << "Configure Permute operation" << std::endl;
-
   const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
   const auto input_index{node.getInputs().at(0)};
 
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
-}
-
-void OperationValidator::visit(const ir::operation::Reduce &node)
-{
-  VERBOSE(Permute) << "Configure " + node.name() + " operation" << std::endl;
-
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
-  const auto input_shape = _ctx.at(input_index).shape();
-  const auto output_shape = _ctx.at(output_index).shape();
-
-  OP_REQUIRES(input_shape.rank() <= 4);
-  OP_REQUIRES(output_shape.rank() <= input_shape.rank());
-
-  // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only
-  // supports cases reducing height and width or reducing depth.
-  // TODO We have to support all cases of dimensions up to 4.
-  // For correct permuting, we have to set output's shape to be equal in dimension position of the
-  // input. But the positions of the same dimensions in the input and output may be set differently.
-  // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original
-  // output shape should be {1,3,1,5}, but real output shape may be {3,5}. If you simply try to
-  // extend it in 4 dimensions, it should be {1,1,3,5}.
-  // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of
-  // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the
-  // next operation is not desired.
-  if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank())
-  {
-    if (output_shape.rank() == 2)
-    {
-      // Reducing HW
-      OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) &&
-                  input_shape.dim(3) == output_shape.dim(1));
-    }
-    else if (output_shape.rank() == 3)
-    {
-      // Reducing C or
-      // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1)
-      OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) &&
-                   input_shape.dim(1) == output_shape.dim(1) &&
-                   input_shape.dim(2) == output_shape.dim(2)) ||
-                  (input_shape.dim(0) == output_shape.dim(0) &&
-                   (input_shape.dim(1) == output_shape.dim(1) ||
-                    input_shape.dim(2) == output_shape.dim(1)) &&
-                   input_shape.dim(3) == 1 && output_shape.dim(2) == 1));
-    }
-  }
-}
-
-void OperationValidator::visit(const ir::operation::Transpose &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
-  const auto &perm{node.param().perm};
-
-  const auto &output_shape = _ctx.at(output_index).shape();
-  const auto &input_shape = _ctx.at(input_index).shape();
-
-  OP_REQUIRES(input_shape.rank() == static_cast<int>(perm.size()));
-  OP_REQUIRES(input_shape.rank() == output_shape.rank());
-}
-
-void OperationValidator::visit(const ir::operation::RNN &node)
-{
-  // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn
-  // TODO Support dynamic rnn
-  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto hidden_state_out_index{
-      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
-
-  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
-  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
-  const auto recurrent_weights_index{
-      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
-  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
-  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
-
-  const auto batch_size = _ctx.at(output_index).shape().dim(0);
-  const auto num_units = _ctx.at(output_index).shape().dim(1);
-
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 &&
-              _ctx.at(hidden_state_out_index).shape().rank() == 2 &&
-              _ctx.at(input_index).shape().rank() == 2 &&
-              _ctx.at(weights_index).shape().rank() == 2 &&
-              _ctx.at(recurrent_weights_index).shape().rank() == 2 &&
-              _ctx.at(hidden_state_in_index).shape().rank() == 2);
-  OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1);
-
-  OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) &&
-              batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) &&
-              batch_size == _ctx.at(hidden_state_out_index).shape().dim(0));
-  OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1));
-
-  OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) &&
-              num_units == _ctx.at(recurrent_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(bias_index).shape().dim(0));
-  OP_REQUIRES(num_units == _ctx.at(output_index).shape().dim(1) &&
-              num_units == _ctx.at(recurrent_weights_index).shape().dim(1) &&
-              num_units == _ctx.at(hidden_state_in_index).shape().dim(1) &&
-              num_units == _ctx.at(hidden_state_out_index).shape().dim(1));
-}
-
-void OperationValidator::visit(const ir::operation::SpaceToBatchND &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
-  const auto block_size_index{
-      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
-  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
-
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
-  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
-
-  // All requirement as per NNAPI specification.
-  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1);
-  OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2);
-
-  OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2);
-  OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2);
-  OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2);
-
-  OP_REQUIRES(_ctx.at(block_size_index).isConstant());
-  OP_REQUIRES(_ctx.at(paddings_index).isConstant());
-
-  OP_REQUIRES(input_shape.C == output_shape.C);
-}
-
-void OperationValidator::visit(const ir::operation::SpaceToDepth &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
-
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
-  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
-  const auto block_size = node.param().block_size;
-
-  // All assertions as per NNAPI specification.
-  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
-  OP_REQUIRES((block_size >= 1) && (input_shape.H % block_size == 0) &&
-              (input_shape.W % block_size == 0));
-  OP_REQUIRES(input_shape.N == output_shape.N);
-  OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C);
-}
-
-void OperationValidator::visit(const ir::operation::ElementwiseActivation &node)
-{
-  checkUnaryOp(node);
+  // Check if I/O types match
+  OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
 }
 
 void OperationValidator::visit(const ir::operation::ElementwiseBinary &node)
@@ -358,9 +103,6 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node)
   const auto output_index{node.getOutputs().at(0)};
   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
 
-  OP_REQUIRES(node.getInputs().size() == 1);
-  OP_REQUIRES(node.getOutputs().size() == 1);
-
   // Check if I/O types match
   if (node.param().op_type == ir::operation::ElementwiseUnary::Type::DEQUANTIZE)
   {
@@ -376,47 +118,13 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node)
   {
     OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
   }
-
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
 }
 
 void OperationValidator::visit(const ir::operation::EmbeddingLookup &node)
 {
-  const auto output_index{node.getOutputs().at(0)};
   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
-  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
-
-  const auto &output_obj = _ctx.at(output_index);
-  const auto &lookups_obj = _ctx.at(lookups_index);
-  const auto &values_obj = _ctx.at(values_index);
-
-  // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying
-  // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729)
-  {
-    OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32);
-
-    if (_ctx.at(output_index).info().isDynamic())
-      return;
 
-    const auto &output_shape = output_obj.shape();
-    const auto &lookups_shape = lookups_obj.shape();
-    const auto &values_shape = values_obj.shape();
-
-    OP_REQUIRES(lookups_shape.rank() == 1);
-    OP_REQUIRES(values_shape.rank() >= 2);
-
-    // output should be a n-D tensor with the same rank and shape as the values tensor, except for
-    // the first dimension which has the same size as lookups' only dimension.
-    OP_REQUIRES(output_shape.rank() == values_shape.rank());
-    OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0));
-    for (int n = 1; n < output_shape.rank(); ++n)
-    {
-      OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n));
-    }
-  }
+  OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32);
 }
 
 void OperationValidator::visit(const ir::operation::ExpandDims &node)
@@ -427,488 +135,35 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node)
 
   OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
   OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32);
-
-  if (_ctx.at(axis_index).info().isDynamic())
-    return;
-  OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1);
 }
 
 void OperationValidator::visit(const ir::operation::HashtableLookup &node)
 {
-  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
-
   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
-  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
-
-  const auto &output_obj = _ctx.at(output_index);
-  const auto &hits_obj = _ctx.at(hits_index);
-
-  const auto &lookups_obj = _ctx.at(lookups_index);
-  const auto &keys_obj = _ctx.at(keys_index);
-  const auto &values_obj = _ctx.at(values_index);
-
-  OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32);
-  OP_REQUIRES(keys_obj.typeInfo().type() == ir::DataType::INT32);
-  OP_REQUIRES(hits_obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM);
-
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
 
-  const auto &output_shape = output_obj.shape();
-  const auto &lookups_shape = lookups_obj.shape();
-  const auto &keys_shape = keys_obj.shape();
-  const auto &values_shape = values_obj.shape();
-
-  OP_REQUIRES(values_shape.rank() == output_shape.rank());
-  OP_REQUIRES(lookups_shape.rank() == 1);
-  OP_REQUIRES(keys_shape.rank() == 1);
-  OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0));
-  OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0));
-}
-
-void OperationValidator::visit(const ir::operation::TransposeConv &node)
-{
-  // param check
-  OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) ||
-              (node.param().padding.type == ir::PaddingType::VALID));
-
-  // shape check
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
-  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
-
-  // Only 4D tensors are supported
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank());
-  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank());
-
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
-  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
-  // The kernel has only IHWO layout on frontend
-  // So ker_shape is treated here below
-  // I -> N
-  // H -> H
-  // W -> W
-  // O -> C
-  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC);
-
-  OP_REQUIRES(ifm_shape.N == ofm_shape.N);
-  OP_REQUIRES(ifm_shape.C == ker_shape.C);
-  OP_REQUIRES(ker_shape.N == ofm_shape.C);
-}
-
-void OperationValidator::visit(const ir::operation::Gather &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
-  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
-
-  const auto ifm_shape = _ctx.at(ifm_index).shape();
-  const auto indices_shape = _ctx.at(indices_index).shape();
-  const auto ofm_shape = _ctx.at(ofm_index).shape();
-
-  OP_REQUIRES(ifm_shape.rank() <= 4);
-  OP_REQUIRES(indices_shape.rank() <= 3);
-  OP_REQUIRES(ofm_shape.rank() <= 4);
-}
-
-void OperationValidator::visit(const ir::operation::DepthToSpace &node)
-{
-  // param check
-  int32_t block_size = node.param().block_size;
-
-  OP_REQUIRES(block_size > 0);
-
-  // shape check
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
-
-  const auto frontend_layout = _current_op_seq_layout;
-  const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout);
-  const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout);
-
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4);
-
-  {
-    OP_REQUIRES(output_shape.N == input_shape.N);
-    OP_REQUIRES(output_shape.H == input_shape.H * block_size);
-    OP_REQUIRES(output_shape.W == input_shape.W * block_size);
-    OP_REQUIRES(input_shape.C % (block_size * block_size) == 0);
-    OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size));
-  }
+  OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32);
+  OP_REQUIRES(_ctx.at(keys_index).typeInfo().type() == ir::DataType::INT32);
+  OP_REQUIRES(_ctx.at(hits_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM);
 }
 
 void OperationValidator::visit(const ir::operation::Pack &node)
 {
-  // param check
   const auto num{node.param().num};
-  const auto axis{node.param().axis};
-  OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size()));
-
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  // shape check
-  const auto &output_shape = _ctx.at(output_index).shape();
-  const auto output_rank = static_cast<int32_t>(output_shape.rank());
 
-  const auto input1_index{node.getInputs().at(0)};
-  const auto input_shape = _ctx.at(input1_index).shape();
-
-  OP_REQUIRES(axis >= -output_rank && axis < output_rank);
-  for (const auto &index : node.getInputs())
-  {
-    OP_REQUIRES(input_shape == _ctx.at(index).shape());
-  }
-}
-
-void OperationValidator::visit(const ir::operation::LSTM &node)
-{
-  // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn
-  // TODO Support dynamic rnn
-  const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto scratch_buffer_index{
-      node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
-  const auto output_state_out_index{
-      node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
-  const auto cell_state_out_index{
-      node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
-
-  const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
-  const auto input_to_input_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
-  const auto input_to_forget_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
-  const auto input_to_cell_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
-  const auto input_to_output_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
-  const auto recurrent_to_input_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
-  const auto recurrent_to_forget_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
-  const auto recurrent_to_cell_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
-  const auto recurrent_to_output_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
-  const auto cell_to_input_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)};
-  const auto cell_to_forget_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)};
-  const auto cell_to_output_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)};
-  const auto input_gate_bias_index{
-      node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
-  const auto forget_gate_bias_index{
-      node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
-  const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
-  const auto output_gate_bias_index{
-      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
-  const auto projection_weights_index{
-      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)};
-  const auto projection_bias_index{
-      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)};
-  const auto output_state_in_index{
-      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
-  const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
-
-  OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2 &&
-              _ctx.at(output_state_out_index).shape().rank() == 2 &&
-              _ctx.at(cell_state_out_index).shape().rank() == 2 &&
-              _ctx.at(output_index).shape().rank() == 2 &&
-              _ctx.at(input_index).shape().rank() == 2 &&
-              _ctx.at(input_to_input_weights_index).shape().rank() == 2 &&
-              _ctx.at(input_to_forget_weights_index).shape().rank() == 2 &&
-              _ctx.at(input_to_cell_weights_index).shape().rank() == 2 &&
-              _ctx.at(input_to_output_weights_index).shape().rank() == 2 &&
-              _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 &&
-              _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 &&
-              _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 &&
-              _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 &&
-              _ctx.at(projection_weights_index).shape().rank() == 2 &&
-              _ctx.at(output_state_in_index).shape().rank() == 2 &&
-              _ctx.at(cell_state_in_index).shape().rank() == 2);
-
-  OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 &&
-              _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 &&
-              _ctx.at(cell_to_output_weights_index).shape().rank() == 1 &&
-              _ctx.at(input_gate_bias_index).shape().rank() == 1 &&
-              _ctx.at(forget_gate_bias_index).shape().rank() == 1 &&
-              _ctx.at(cell_bias_index).shape().rank() == 1 &&
-              _ctx.at(output_gate_bias_index).shape().rank() == 1 &&
-              _ctx.at(projection_bias_index).shape().rank() == 1);
-
-  // CIFG assertion
-  OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 &&
-               _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 &&
-               _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 &&
-               _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 &&
-               _ctx.at(input_gate_bias_index).shape().dim(0) == 0 &&
-               _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) ||
-              (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
-               _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 &&
-               _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
-               _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 &&
-               _ctx.at(input_gate_bias_index).shape().dim(0) != 0));
-
-  // Peephole assertion
-  OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 &&
-               _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) ||
-              (_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 &&
-               _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0));
-
-  bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
-                                    _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
-  bool has_recurrent_to_input_weights =
-      _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
-      _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
-  bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0;
-  bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0;
-  bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
-  bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
-  bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
-                                _ctx.at(projection_weights_index).shape().dim(1) != 0;
-  bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
-
-  // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG).
-  // true: no CIFG
-  // false: CIFG
-  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
-
-  // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole.
-  // true: peephole
-  // false: no peephole
-  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
-
-  // NOTE The projection weights may have data but the projection bias may not.
-  bool has_projection_param = has_projection_weights;
-
-  const auto batch_size = _ctx.at(input_index).shape().dim(0);
-  OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) &&
-              batch_size == _ctx.at(cell_state_in_index).shape().dim(0) &&
-              batch_size == _ctx.at(scratch_buffer_index).shape().dim(0) &&
-              batch_size == _ctx.at(output_state_out_index).shape().dim(0) &&
-              batch_size == _ctx.at(cell_state_out_index).shape().dim(0) &&
-              batch_size == _ctx.at(output_index).shape().dim(0));
-
-  const auto input_size = _ctx.at(input_index).shape().dim(1);
-  OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) &&
-              input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) &&
-              input_size == _ctx.at(input_to_output_weights_index).shape().dim(1));
-
-  const auto num_units = _ctx.at(cell_state_out_index).shape().dim(1);
-  OP_REQUIRES(num_units == _ctx.at(input_to_forget_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) &&
-              num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) &&
-              num_units == _ctx.at(cell_bias_index).shape().dim(0) &&
-              num_units == _ctx.at(output_gate_bias_index).shape().dim(0) &&
-              num_units == _ctx.at(cell_state_in_index).shape().dim(1) &&
-              (((num_units * 3) == _ctx.at(scratch_buffer_index).shape().dim(1)) ||
-               ((num_units * 4) == _ctx.at(scratch_buffer_index).shape().dim(1))));
-
-  const auto output_size = _ctx.at(output_index).shape().dim(1);
-  OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) &&
-              output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) &&
-              output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) &&
-              output_size == _ctx.at(output_state_in_index).shape().dim(1) &&
-              output_size == _ctx.at(output_state_out_index).shape().dim(1));
-
-  if (has_cifg_param)
-  {
-    OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1));
-    OP_REQUIRES(num_units == _ctx.at(input_to_input_weights_index).shape().dim(0) &&
-                num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) &&
-                (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) ||
-                 _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) &&
-                num_units == _ctx.at(input_gate_bias_index).shape().dim(0));
-    OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1));
-    OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights &&
-                has_input_gate_bias);
-    if (has_cell_to_input_weights)
-    {
-      // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole.
-      OP_REQUIRES(has_peephole_param);
-    }
-    OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4);
-  }
-  else
-  {
-    OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3);
-  }
-
-  if (has_peephole_param)
-  {
-    OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) &&
-                num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) &&
-                (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) ||
-                 _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */));
-  }
-
-  if (has_projection_param)
-  {
-    OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1));
-    OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0));
-    if (has_projection_bias)
-    {
-      OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0));
-    }
-  }
-}
-
-void OperationValidator::visit(const ir::operation::L2Normalization &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  if (_ctx.at(ofm_index).info().isDynamic())
-    return;
-
-  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
-
-  auto ifm_shape = _ctx.at(ifm_index).shape();
-  auto ofm_shape = _ctx.at(ofm_index).shape();
-
-  OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank());
-
-  for (auto i = 0; i < ifm_shape.rank(); i++)
-  {
-    OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i));
-  }
-}
-
-void OperationValidator::visit(const ir::operation::Unpack &node)
-{
-  const auto num{node.param().num};
-  OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size()));
-  const auto axis{node.param().axis};
-
-  const auto output_index{node.getInputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
-
-  const auto &input_shape = _ctx.at(input_index).shape();
-  const auto input_rank = static_cast<int32_t>(input_shape.rank());
-
-  OP_REQUIRES(axis >= -input_rank && axis < input_rank);
+  OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size()));
 }
 
 void OperationValidator::visit(const ir::operation::Pad &node)
 {
   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
-  OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32);
-
-  const auto output_index{node.getInputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
-
-  const auto &pad_shape = _ctx.at(pad_index).shape();
-  const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank());
-
-  OP_REQUIRES(pad_shape.rank() == 2);
-  OP_REQUIRES(pad_shape.dim(0) == input_rank);
-  OP_REQUIRES(pad_shape.dim(1) == 2);
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
-}
-
-void OperationValidator::visit(const ir::operation::Select &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  // This validator does not check shape. So checking isDynamic() is skipped.
-
-  const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)};
-  const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
-  const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
-  UNUSED_RELEASE(output_index);
-  UNUSED_RELEASE(input_true_index);
-  UNUSED_RELEASE(input_false_index);
-
-  OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8);
-}
-
-void OperationValidator::visit(const ir::operation::StridedSlice &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
-  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
-  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
-  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
-
-  UNUSED_RELEASE(starts_index);
-  UNUSED_RELEASE(ends_index);
-  UNUSED_RELEASE(strides_index);
-
-  OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4);
-}
 
-void OperationValidator::visit(const ir::operation::Split &node)
-{
-  const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
-
-  if (_ctx.at(input_index).info().isDynamic())
-    return;
-
-  const auto num_splits = node.param().num_splits;
-  const auto input_rank = _ctx.at(input_index).shape().rank();
-  const auto axis = node.param().axis < 0 ? node.param().axis + input_rank : node.param().axis;
-
-  OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF);
-  OP_REQUIRES(axis >= 0 && axis < input_rank);
-  OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits));
-
-  OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0);
-}
-
-void OperationValidator::visit(const ir::operation::Shape &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(0)};
-  UNUSED_RELEASE(input_index);
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32);
 }
 
 void OperationValidator::visit(const ir::operation::ResizeBilinear &node)
 {
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
-
-  if (_ctx.at(output_index).info().isDynamic())
-  {
-    return;
-  }
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4);
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4);
-
   auto align_corners = node.param().align_corners;
   auto half_pixel_centers = node.param().half_pixel_centers;
 
@@ -923,23 +178,31 @@ void OperationValidator::visit(const ir::operation::Reverse &node)
 
   OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32);
   OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+}
+
+void OperationValidator::visit(const ir::operation::SpaceToBatchND &node)
+{
+  const auto block_size_index{
+      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
 
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+  // Non-constant block_size and padding is not implemented yet
+  OP_REQUIRES(_ctx.at(block_size_index).isConstant());
+  OP_REQUIRES(_ctx.at(paddings_index).isConstant());
 }
 
-void OperationValidator::visit(const ir::operation::If &)
+void OperationValidator::visit(const ir::operation::SpaceToDepth &node)
 {
-  // TODO Add to validate with subgraphs
+  const auto block_size = node.param().block_size;
+  OP_REQUIRES(block_size >= 1);
 }
 
-void OperationValidator::visit(const ir::operation::While &node)
+void OperationValidator::visit(const ir::operation::Split &node)
 {
-  // This validator does not check shape. So checking isDynamic() is skipped.
+  const auto num_splits = node.param().num_splits;
 
-  OP_REQUIRES(node.getInputs().size() == node.getOutputs().size());
-  // TODO Add to validate with subgraphs
+  OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF);
+  OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits));
 }
 
 void OperationValidator::visit(const ir::operation::SquaredDifference &node)
@@ -948,105 +211,33 @@ void OperationValidator::visit(const ir::operation::SquaredDifference &node)
   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
 
-  // Check for Type equivalence
   OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(lhs_index).typeInfo().type());
   OP_REQUIRES(_ctx.at(lhs_index).typeInfo().type() == _ctx.at(rhs_index).typeInfo().type());
-
-  // Check for dimension constraints
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  auto output_shape = _ctx.at(output_index).shape();
-  auto lhs_shape = _ctx.at(lhs_index).shape();
-  auto rhs_shape = _ctx.at(rhs_index).shape();
-  // Check for output rank
-  OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank()));
-  auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank());
-
-  for (int idx = 1; idx <= min_rank; idx++)
-  {
-    int l_idx = lhs_shape.rank() - idx;
-    int r_idx = rhs_shape.rank() - idx;
-    int out_idx = output_shape.rank() - idx;
-
-    OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0));
-
-    auto l_dims = lhs_shape.dim(l_idx);
-    auto r_dims = rhs_shape.dim(r_idx);
-    auto out_dims = output_shape.dim(out_idx);
-
-    OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) ||
-                ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims)));
-  }
-  auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? lhs_shape : rhs_shape;
-  for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++)
-  {
-    int out_idx = output_shape.rank() - idx;
-    int tmp_idx = tmp_shape.rank() - idx;
-
-    OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) &&
-                (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx)));
-  }
 }
-void OperationValidator::visit(const ir::operation::Tile &node)
+
+void OperationValidator::visit(const ir::operation::StridedSlice &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(0)};
-  const auto multiple_index{node.getInputs().at(1)};
+  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
 
-  OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1);
-  OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank());
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
+  OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
 }
 
-void OperationValidator::visit(const ir::operation::Range &node)
+void OperationValidator::visit(const ir::operation::TransposeConv &node)
 {
-  const auto output_index{node.getOutputs().at(0)};
-  const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)};
-  const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)};
-  const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)};
-
-  // Check for dimension constraints
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0);
-  OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0);
-  OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0);
+  OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) ||
+              (node.param().padding.type == ir::PaddingType::VALID));
 }
 
-void OperationValidator::visit(const ir::operation::MatrixBandPart &node)
+void OperationValidator::visit(const ir::operation::Unpack &node)
 {
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)};
-  const auto num_lower_index{
-      node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)};
-  const auto num_upper_index{
-      node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)};
-
-  // Check for dimension constraints
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2);     // input must be more than 2 dim matrix
-  OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar
-  OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar
+  const auto num{node.param().num};
+  OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size()));
 }
 
-void OperationValidator::visit(const ir::operation::LogSoftmax &node)
+void OperationValidator::visit(const ir::operation::While &node)
 {
-  VERBOSE(LogSoftmax) << "Configure LOGSOFTMAX operation" << std::endl;
-
-  const auto output_index{node.getOutputs().at(0)};
-  if (_ctx.at(output_index).info().isDynamic())
-    return;
-
-  const auto input_index{node.getInputs().at(0)};
-
-  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+  OP_REQUIRES(node.getInputs().size() == node.getOutputs().size());
 }
 
 } // namespace compiler
diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h
index deb6357bb..f884a3765 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.h
+++ b/runtime/onert/core/src/compiler/OperationValidator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #ifndef __ONERT_COMPILER_OPERATION_VALIDATOR_H__
 #define __ONERT_COMPILER_OPERATION_VALIDATOR_H__
 
-#include "ir/Layout.h"
 #include "ir/OperationVisitor.h"
 
 namespace onert
@@ -47,51 +46,30 @@ public:
   void visit(const ir::operation::BatchMatMul &node) override;
   void visit(const ir::operation::BatchToSpaceND &node) override;
   void visit(const ir::operation::Comparison &node) override;
-  void visit(const ir::operation::Softmax &node) override;
-  void visit(const ir::operation::InstanceNorm &node) override;
-  void visit(const ir::operation::Permute &node) override;
-  void visit(const ir::operation::Pool2D &node) override;
-  void visit(const ir::operation::Reduce &node) override;
-  void visit(const ir::operation::Transpose &node) override;
-  void visit(const ir::operation::RNN &node) override;
-  void visit(const ir::operation::SpaceToBatchND &node) override;
-  void visit(const ir::operation::SpaceToDepth &node) override;
+  void visit(const ir::operation::DepthToSpace &node) override;
   void visit(const ir::operation::ElementwiseActivation &node) override;
   void visit(const ir::operation::ElementwiseBinary &node) override;
   void visit(const ir::operation::ElementwiseUnary &node) override;
   void visit(const ir::operation::EmbeddingLookup &node) override;
   void visit(const ir::operation::ExpandDims &node) override;
   void visit(const ir::operation::HashtableLookup &node) override;
-  void visit(const ir::operation::TransposeConv &node) override;
-  void visit(const ir::operation::Gather &node) override;
-  void visit(const ir::operation::DepthToSpace &node) override;
   void visit(const ir::operation::Pack &node) override;
-  void visit(const ir::operation::LSTM &node) override;
-  void visit(const ir::operation::L2Normalization &node) override;
-  void visit(const ir::operation::Unpack &node) override;
   void visit(const ir::operation::Pad &node) override;
-  void visit(const ir::operation::Select &node) override;
-  void visit(const ir::operation::StridedSlice &node) override;
-  void visit(const ir::operation::Split &node) override;
-  void visit(const ir::operation::Shape &node) override;
   void visit(const ir::operation::ResizeBilinear &node) override;
   void visit(const ir::operation::Reverse &node) override;
-  void visit(const ir::operation::If &node) override;
-  void visit(const ir::operation::While &node) override;
+  void visit(const ir::operation::SpaceToBatchND &node) override;
+  void visit(const ir::operation::SpaceToDepth &node) override;
+  void visit(const ir::operation::Split &node) override;
   void visit(const ir::operation::SquaredDifference &node) override;
-  void visit(const ir::operation::Tile &node) override;
-  void visit(const ir::operation::Range &node) override;
-  void visit(const ir::operation::MatrixBandPart &node) override;
-  void visit(const ir::operation::LogSoftmax &node) override;
-
-private:
-  void checkUnaryOp(const ir::Operation &node);
+  void visit(const ir::operation::StridedSlice &node) override;
+  void visit(const ir::operation::TransposeConv &node) override;
+  void visit(const ir::operation::Unpack &node) override;
+  void visit(const ir::operation::While &node) override;
 
 private:
   // TODO Remove _ctx field
   const ir::Graph &_graph;
   const ir::Operands &_ctx;
-  ir::Layout _current_op_seq_layout;
 };
 
 } // namespace compiler
diff --git a/runtime/onert/core/src/compiler/ShapeValidator.cc b/runtime/onert/core/src/compiler/ShapeValidator.cc
new file mode 100644
index 000000000..8be4fe6ec
--- /dev/null
+++ b/runtime/onert/core/src/compiler/ShapeValidator.cc
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ShapeValidator.h"
+
+#include <typeinfo>
+
+#include "ir/Graph.h"
+#include "ir/operation/LowerInfo.h"
+
+#include "util/logging.h"
+#include "util/Utils.h"
+
+#define OP_REQUIRES(EXP)                                                                     \
+  do                                                                                         \
+  {                                                                                          \
+    if (!(EXP))                                                                              \
+      throw std::runtime_error("ShapeValidator failed at line " + std::to_string(__LINE__)); \
+  } while (0)
+
+namespace onert
+{
+namespace compiler
+{
+
+ShapeValidator::ShapeValidator(const ir::Graph &graph)
+    : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN}
+{
+}
+
+void ShapeValidator::checkUnaryOp(const ir::Operation &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(0)};
+
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  // Check if I/O shapes match
+  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+}
+
+void ShapeValidator::operator()()
+{
+  // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when
+  // creating Compiler
+  assert(_graph.subgraphs() == nullptr);
+
+  _current_op_seq_layout = _graph.layout();
+
+  _graph.operations().iterate(
+      [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
+}
+
+void ShapeValidator::visit(const ir::operation::BatchMatMul &node)
+{
+  const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS));
+  const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS));
+  const auto out_index{node.getOutputs().at(0)};
+
+  if (_ctx.at(out_index).info().isDynamic())
+    return;
+
+  OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4);
+  OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4);
+  OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2);
+  OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2);
+}
+
+void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
+  const auto block_size_index{
+      node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
+  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
+
+  // All requirement as per NNAPI specification.
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1);
+
+  OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2);
+
+  OP_REQUIRES(input_shape.C == output_shape.C);
+}
+
+void ShapeValidator::visit(const ir::operation::BCQFullyConnected &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)};
+  const auto weight_scales_index{
+      node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_SCALES)};
+  const auto weight_binary_index{
+      node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_BINARY)};
+  const auto weight_cluster_index{
+      node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+  // const auto bias_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::BIAS)};
+
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 2);
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 2);
+  OP_REQUIRES(_ctx.at(weight_scales_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(weight_binary_index).shape().rank() == 2);
+  OP_REQUIRES(_ctx.at(weight_cluster_index).shape().rank() == 2);
+
+  OP_REQUIRES(_ctx.at(ifm_index).shape().dim(1) == _ctx.at(ofm_index).shape().dim(1));
+
+  OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(0) > 0);
+  OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(1) == 2);
+
+  // more shape validation will be done inside kernel.
+
+  // TODO Check bias dimension (can be null tensor)
+}
+
+void ShapeValidator::visit(const ir::operation::BCQGather &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto indices_index{node.getInputs().at(ir::operation::BCQGather::Input::INDICES)};
+  const auto input_binary_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)};
+  const auto input_scales_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_SCALES)};
+  const auto input_clusters_index{
+      node.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)};
+
+  OP_REQUIRES(_ctx.at(indices_index).shape().rank() <= 2); // TODO : support rank up to 4 or more
+  OP_REQUIRES(_ctx.at(input_binary_index).shape().rank() == 2);
+  OP_REQUIRES(_ctx.at(input_scales_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(input_clusters_index).shape().rank() == 2);
+
+  OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(0) > 0);
+  OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(1) == 2);
+
+  // more shape validation will be done inside kernel.
+}
+
+void ShapeValidator::visit(const ir::operation::Comparison &)
+{
+  // TODO Shape validation of comparison
+}
+
+void ShapeValidator::visit(const ir::operation::Softmax &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(0)};
+
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+}
+
+void ShapeValidator::visit(const ir::operation::InstanceNorm &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
+  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
+  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
+
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape());
+  OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1);
+}
+
+void ShapeValidator::visit(const ir::operation::Pool2D &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)};
+
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+}
+
+void ShapeValidator::visit(const ir::operation::Permute &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(0)};
+
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+}
+
+void ShapeValidator::visit(const ir::operation::Reduce &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
+  const auto input_shape = _ctx.at(input_index).shape();
+  const auto output_shape = _ctx.at(output_index).shape();
+
+  OP_REQUIRES(input_shape.rank() <= 4);
+  OP_REQUIRES(output_shape.rank() <= input_shape.rank());
+
+  // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only
+  // supports cases reducing height and width or reducing depth.
+  // TODO We have to support all cases of dimensions up to 4.
+  // For correct permuting, we have to set output's shape to be equal in dimension position of the
+  // input. But the positions of the same dimensions in the input and output may be set differently.
+  // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original
+  // output shape should be {1,3,1,5}, but real output shape may be {3,5}. If you simply try to
+  // extend it in 4 dimensions, it should be {1,1,3,5}.
+  // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of
+  // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the
+  // next operation is not desired.
+  if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank())
+  {
+    if (output_shape.rank() == 2)
+    {
+      // Reducing HW
+      OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) &&
+                  input_shape.dim(3) == output_shape.dim(1));
+    }
+    else if (output_shape.rank() == 3)
+    {
+      // Reducing C or
+      // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1)
+      OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) &&
+                   input_shape.dim(1) == output_shape.dim(1) &&
+                   input_shape.dim(2) == output_shape.dim(2)) ||
+                  (input_shape.dim(0) == output_shape.dim(0) &&
+                   (input_shape.dim(1) == output_shape.dim(1) ||
+                    input_shape.dim(2) == output_shape.dim(1)) &&
+                   input_shape.dim(3) == 1 && output_shape.dim(2) == 1));
+    }
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::Transpose &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
+  const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
+
+  const auto &output_shape = _ctx.at(output_index).shape();
+  const auto &input_shape = _ctx.at(input_index).shape();
+
+  OP_REQUIRES(_ctx.at(perm_index).shape().num_elements() == 0 ||
+              input_shape.rank() == static_cast<int>(_ctx.at(perm_index).shape().num_elements()));
+  OP_REQUIRES(input_shape.rank() == output_shape.rank());
+}
+
+void ShapeValidator::visit(const ir::operation::RNN &node)
+{
+  // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn
+  // TODO Support dynamic rnn
+  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto hidden_state_out_index{
+      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
+
+  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
+  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
+  const auto recurrent_weights_index{
+      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
+  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
+  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
+
+  const auto batch_size = _ctx.at(output_index).shape().dim(0);
+  const auto num_units = _ctx.at(output_index).shape().dim(1);
+
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 &&
+              _ctx.at(hidden_state_out_index).shape().rank() == 2 &&
+              _ctx.at(input_index).shape().rank() == 2 &&
+              _ctx.at(weights_index).shape().rank() == 2 &&
+              _ctx.at(recurrent_weights_index).shape().rank() == 2 &&
+              _ctx.at(hidden_state_in_index).shape().rank() == 2);
+  OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1);
+
+  OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) &&
+              batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) &&
+              batch_size == _ctx.at(hidden_state_out_index).shape().dim(0));
+  OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1));
+
+  OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) &&
+              num_units == _ctx.at(recurrent_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(bias_index).shape().dim(0));
+  OP_REQUIRES(num_units == _ctx.at(output_index).shape().dim(1) &&
+              num_units == _ctx.at(recurrent_weights_index).shape().dim(1) &&
+              num_units == _ctx.at(hidden_state_in_index).shape().dim(1) &&
+              num_units == _ctx.at(hidden_state_out_index).shape().dim(1));
+}
+
+void ShapeValidator::visit(const ir::operation::SpaceToBatchND &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
+  const auto block_size_index{
+      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
+
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
+  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
+
+  // All requirement as per NNAPI specification.
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2);
+
+  OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2);
+  OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2);
+  OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2);
+
+  OP_REQUIRES(input_shape.C == output_shape.C);
+}
+
+void ShapeValidator::visit(const ir::operation::SpaceToDepth &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
+
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
+  const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
+  const auto block_size = node.param().block_size;
+
+  // All assertions as per NNAPI specification.
+  OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
+  OP_REQUIRES((input_shape.H % block_size == 0) && (input_shape.W % block_size == 0));
+  OP_REQUIRES(input_shape.N == output_shape.N);
+  OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C);
+}
+
+void ShapeValidator::visit(const ir::operation::ElementwiseActivation &node) { checkUnaryOp(node); }
+
+void ShapeValidator::visit(const ir::operation::ElementwiseBinary &)
+{
+  // TODO Shape validation of ElementwiseBinary
+}
+
+void ShapeValidator::visit(const ir::operation::ElementwiseUnary &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
+
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+}
+
+void ShapeValidator::visit(const ir::operation::EmbeddingLookup &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
+  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
+
+  const auto &output_obj = _ctx.at(output_index);
+  const auto &lookups_obj = _ctx.at(lookups_index);
+  const auto &values_obj = _ctx.at(values_index);
+
+  // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying
+  // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729)
+  {
+    if (_ctx.at(output_index).info().isDynamic())
+      return;
+
+    const auto &output_shape = output_obj.shape();
+    const auto &lookups_shape = lookups_obj.shape();
+    const auto &values_shape = values_obj.shape();
+
+    OP_REQUIRES(lookups_shape.rank() == 1);
+    OP_REQUIRES(values_shape.rank() >= 2);
+
+    // output should be a n-D tensor with the same rank and shape as the values tensor, except for
+    // the first dimension which has the same size as lookups' only dimension.
+    OP_REQUIRES(output_shape.rank() == values_shape.rank());
+    OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0));
+    for (int n = 1; n < output_shape.rank(); ++n)
+    {
+      OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n));
+    }
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::ExpandDims &node)
+{
+  const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)};
+
+  if (_ctx.at(axis_index).info().isDynamic())
+    return;
+  OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1);
+}
+
+void ShapeValidator::visit(const ir::operation::HashtableLookup &node)
+{
+  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
+  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
+  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
+  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
+
+  const auto &output_obj = _ctx.at(output_index);
+  const auto &lookups_obj = _ctx.at(lookups_index);
+  const auto &keys_obj = _ctx.at(keys_index);
+  const auto &values_obj = _ctx.at(values_index);
+
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto &output_shape = output_obj.shape();
+  const auto &lookups_shape = lookups_obj.shape();
+  const auto &keys_shape = keys_obj.shape();
+  const auto &values_shape = values_obj.shape();
+
+  OP_REQUIRES(values_shape.rank() == output_shape.rank());
+  OP_REQUIRES(lookups_shape.rank() == 1);
+  OP_REQUIRES(keys_shape.rank() == 1);
+  OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0));
+  OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0));
+}
+
+void ShapeValidator::visit(const ir::operation::TransposeConv &node)
+{
+  // shape check
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
+  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
+
+  // Only 4D tensors are supported
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank());
+  OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank());
+
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout);
+  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
+  // The kernel has only IHWO layout on frontend
+  // So ker_shape is treated here below
+  // I -> N
+  // H -> H
+  // W -> W
+  // O -> C
+  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC);
+
+  OP_REQUIRES(ifm_shape.N == ofm_shape.N);
+  OP_REQUIRES(ifm_shape.C == ker_shape.C);
+  OP_REQUIRES(ker_shape.N == ofm_shape.C);
+}
+
+void ShapeValidator::visit(const ir::operation::Gather &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
+  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
+
+  const auto ifm_shape = _ctx.at(ifm_index).shape();
+  const auto indices_shape = _ctx.at(indices_index).shape();
+  const auto ofm_shape = _ctx.at(ofm_index).shape();
+
+  OP_REQUIRES(ifm_shape.rank() <= 4);
+  OP_REQUIRES(indices_shape.rank() <= 3);
+  OP_REQUIRES(ofm_shape.rank() <= 4);
+}
+
+void ShapeValidator::visit(const ir::operation::DepthToSpace &node)
+{
+  int32_t block_size = node.param().block_size;
+
+  // shape check
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
+
+  const auto frontend_layout = _current_op_seq_layout;
+  const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout);
+  const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout);
+
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4);
+
+  {
+    OP_REQUIRES(output_shape.N == input_shape.N);
+    OP_REQUIRES(output_shape.H == input_shape.H * block_size);
+    OP_REQUIRES(output_shape.W == input_shape.W * block_size);
+    OP_REQUIRES(input_shape.C % (block_size * block_size) == 0);
+    OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size));
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::Pack &node)
+{
+  const auto axis{node.param().axis};
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  // shape check
+  const auto &output_shape = _ctx.at(output_index).shape();
+  const auto output_rank = static_cast<int32_t>(output_shape.rank());
+
+  const auto input1_index{node.getInputs().at(0)};
+  const auto input_shape = _ctx.at(input1_index).shape();
+
+  OP_REQUIRES(axis >= -output_rank && axis < output_rank);
+  for (const auto &index : node.getInputs())
+  {
+    OP_REQUIRES(input_shape == _ctx.at(index).shape());
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::LSTM &node)
+{
+  // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn
+  // TODO Support dynamic rnn
+  const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto scratch_buffer_index{
+      node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+  const auto output_state_out_index{
+      node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+  const auto cell_state_out_index{
+      node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+
+  const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+  const auto input_to_input_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
+  const auto input_to_forget_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+  const auto input_to_cell_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+  const auto input_to_output_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+  const auto recurrent_to_input_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
+  const auto recurrent_to_forget_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+  const auto recurrent_to_cell_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+  const auto recurrent_to_output_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+  const auto cell_to_input_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)};
+  const auto cell_to_forget_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)};
+  const auto cell_to_output_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)};
+  const auto input_gate_bias_index{
+      node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+  const auto forget_gate_bias_index{
+      node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+  const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+  const auto output_gate_bias_index{
+      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+  const auto projection_weights_index{
+      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)};
+  const auto projection_bias_index{
+      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)};
+  const auto output_state_in_index{
+      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+  const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
+  for (int i = 0; i < _ctx.at(input_index).shape().rank() - 1; ++i)
+  {
+    OP_REQUIRES(_ctx.at(input_index).shape().dim(i) == _ctx.at(output_index).shape().dim(i));
+  }
+  OP_REQUIRES(
+      (_ctx.at(output_index).shape().rank() == 2 || _ctx.at(output_index).shape().rank() == 3) &&
+      (_ctx.at(input_index).shape().rank() == 2 || _ctx.at(input_index).shape().rank() == 3) &&
+      _ctx.at(input_to_input_weights_index).shape().rank() == 2 &&
+      _ctx.at(input_to_forget_weights_index).shape().rank() == 2 &&
+      _ctx.at(input_to_cell_weights_index).shape().rank() == 2 &&
+      _ctx.at(input_to_output_weights_index).shape().rank() == 2 &&
+      _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 &&
+      _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 &&
+      _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 &&
+      _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 &&
+      _ctx.at(projection_weights_index).shape().rank() == 2 &&
+      _ctx.at(output_state_in_index).shape().rank() == 2 &&
+      _ctx.at(cell_state_in_index).shape().rank() == 2);
+
+  OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 &&
+              _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 &&
+              _ctx.at(cell_to_output_weights_index).shape().rank() == 1 &&
+              _ctx.at(input_gate_bias_index).shape().rank() == 1 &&
+              _ctx.at(forget_gate_bias_index).shape().rank() == 1 &&
+              _ctx.at(cell_bias_index).shape().rank() == 1 &&
+              _ctx.at(output_gate_bias_index).shape().rank() == 1 &&
+              _ctx.at(projection_bias_index).shape().rank() == 1);
+
+  // CIFG assertion
+  OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 &&
+               _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 &&
+               _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 &&
+               _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 &&
+               _ctx.at(input_gate_bias_index).shape().dim(0) == 0 &&
+               _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) ||
+              (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+               _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 &&
+               _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+               _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 &&
+               _ctx.at(input_gate_bias_index).shape().dim(0) != 0));
+
+  // Peephole assertion
+  OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 &&
+               _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) ||
+              (_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 &&
+               _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0));
+
+  bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+                                    _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
+  bool has_recurrent_to_input_weights =
+      _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+      _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+  bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0;
+  bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0;
+  bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+  bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
+  bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
+                                _ctx.at(projection_weights_index).shape().dim(1) != 0;
+  bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
+
+  // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG).
+  // true: no CIFG
+  // false: CIFG
+  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+
+  // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole.
+  // true: peephole
+  // false: no peephole
+  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+
+  // NOTE The projection weights may have data but the projection bias may not.
+  bool has_projection_param = has_projection_weights;
+
+  const auto batch_size = (_ctx.at(input_index).shape().rank() == 3 && node.param().time_major)
+                              ? _ctx.at(input_index).shape().dim(1)
+                              : _ctx.at(input_index).shape().dim(0);
+  OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) &&
+              batch_size == _ctx.at(cell_state_in_index).shape().dim(0));
+
+  const auto input_size = _ctx.at(input_index).shape().dim(_ctx.at(input_index).shape().rank() - 1);
+  OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) &&
+              input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) &&
+              input_size == _ctx.at(input_to_output_weights_index).shape().dim(1));
+
+  const auto num_units = _ctx.at(input_to_output_weights_index).shape().dim(0);
+  OP_REQUIRES(num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) &&
+              num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) &&
+              num_units == _ctx.at(cell_bias_index).shape().dim(0) &&
+              num_units == _ctx.at(output_gate_bias_index).shape().dim(0) &&
+              num_units == _ctx.at(cell_state_in_index).shape().dim(1));
+
+  const auto output_size =
+      _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
+  OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) &&
+              output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) &&
+              output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) &&
+              output_size == _ctx.at(output_state_in_index).shape().dim(1));
+
+  if (has_cifg_param)
+  {
+    OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1));
+    OP_REQUIRES(num_units == _ctx.at(input_to_input_weights_index).shape().dim(0) &&
+                num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) &&
+                (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) ||
+                 _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) &&
+                num_units == _ctx.at(input_gate_bias_index).shape().dim(0));
+    OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1));
+    OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights &&
+                has_input_gate_bias);
+    if (has_cell_to_input_weights)
+    {
+      // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole.
+      OP_REQUIRES(has_peephole_param);
+    }
+    if (_ctx.exist(scratch_buffer_index))
+      OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4);
+  }
+  else
+  {
+    if (_ctx.exist(scratch_buffer_index))
+      OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3);
+  }
+
+  if (has_peephole_param)
+  {
+    OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) &&
+                num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) &&
+                (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) ||
+                 _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */));
+  }
+
+  if (has_projection_param)
+  {
+    OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1));
+    OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0));
+    if (has_projection_bias)
+    {
+      OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0));
+    }
+  }
+
+  if (_ctx.exist(scratch_buffer_index))
+  {
+    OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2);
+    OP_REQUIRES(batch_size == _ctx.at(scratch_buffer_index).shape().dim(0));
+  }
+
+  if (_ctx.exist(output_state_out_index))
+  {
+    OP_REQUIRES(_ctx.at(output_state_out_index).shape().rank() == 2);
+    OP_REQUIRES(batch_size == _ctx.at(output_state_out_index).shape().dim(0));
+    OP_REQUIRES(output_size == _ctx.at(output_state_out_index).shape().dim(1));
+  }
+
+  if (_ctx.exist(cell_state_out_index))
+  {
+    OP_REQUIRES(_ctx.at(cell_state_out_index).shape().rank() == 2);
+    OP_REQUIRES(batch_size == _ctx.at(cell_state_out_index).shape().dim(0));
+    OP_REQUIRES(num_units == _ctx.at(cell_state_out_index).shape().dim(1));
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::L2Normalization &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  if (_ctx.at(ofm_index).info().isDynamic())
+    return;
+
+  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
+
+  auto ifm_shape = _ctx.at(ifm_index).shape();
+  auto ofm_shape = _ctx.at(ofm_index).shape();
+
+  OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank());
+
+  for (auto i = 0; i < ifm_shape.rank(); i++)
+  {
+    OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i));
+  }
+}
+
+void ShapeValidator::visit(const ir::operation::Unpack &node)
+{
+  const auto axis{node.param().axis};
+  const auto output_index{node.getInputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
+
+  const auto &input_shape = _ctx.at(input_index).shape();
+  const auto input_rank = static_cast<int32_t>(input_shape.rank());
+
+  OP_REQUIRES(axis >= -input_rank && axis < input_rank);
+}
+
+void ShapeValidator::visit(const ir::operation::Pad &node)
+{
+  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
+  OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32);
+
+  const auto output_index{node.getInputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
+
+  const auto &pad_shape = _ctx.at(pad_index).shape();
+  const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank());
+
+  OP_REQUIRES(pad_shape.rank() == 2);
+  OP_REQUIRES(pad_shape.dim(0) == input_rank);
+  OP_REQUIRES(pad_shape.dim(1) == 2);
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
+}
+
+void ShapeValidator::visit(const ir::operation::Select &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  // This validator does not check shape. So checking isDynamic() is skipped.
+
+  const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)};
+  const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
+  const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
+  UNUSED_RELEASE(output_index);
+  UNUSED_RELEASE(input_true_index);
+  UNUSED_RELEASE(input_false_index);
+
+  OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8);
+}
+
+void ShapeValidator::visit(const ir::operation::StridedSlice &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
+
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4);
+}
+
+void ShapeValidator::visit(const ir::operation::Split &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
+
+  const auto num_splits = node.param().num_splits;
+  const auto input_rank = _ctx.at(input_index).shape().rank();
+  auto axis = *reinterpret_cast<const int32_t *>(_ctx.at(axis_index).data()->base());
+  axis = axis < 0 ? axis + input_rank : axis;
+
+  OP_REQUIRES(axis >= 0 && axis < input_rank);
+  OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0);
+}
+
+void ShapeValidator::visit(const ir::operation::Shape &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(0)};
+  UNUSED_RELEASE(input_index);
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1);
+}
+
+void ShapeValidator::visit(const ir::operation::ResizeBilinear &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+
+  if (_ctx.at(output_index).info().isDynamic())
+  {
+    return;
+  }
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4);
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4);
+}
+
+void ShapeValidator::visit(const ir::operation::Reverse &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
+
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+}
+
+void ShapeValidator::visit(const ir::operation::If &)
+{
+  // TODO Add to validate with subgraphs
+}
+
+void ShapeValidator::visit(const ir::operation::While &)
+{
+  // This validator does not check shape. So checking isDynamic() is skipped.
+  // TODO Add to validate with subgraphs
+}
+
+void ShapeValidator::visit(const ir::operation::SquaredDifference &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+  // Check for dimension constraints
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  auto output_shape = _ctx.at(output_index).shape();
+  auto lhs_shape = _ctx.at(lhs_index).shape();
+  auto rhs_shape = _ctx.at(rhs_index).shape();
+  // Check for output rank
+  OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank()));
+  auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank());
+
+  for (int idx = 1; idx <= min_rank; idx++)
+  {
+    int l_idx = lhs_shape.rank() - idx;
+    int r_idx = rhs_shape.rank() - idx;
+    int out_idx = output_shape.rank() - idx;
+
+    OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0));
+
+    auto l_dims = lhs_shape.dim(l_idx);
+    auto r_dims = rhs_shape.dim(r_idx);
+    auto out_dims = output_shape.dim(out_idx);
+
+    OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) ||
+                ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims)));
+  }
+  auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? lhs_shape : rhs_shape;
+  for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++)
+  {
+    int out_idx = output_shape.rank() - idx;
+    int tmp_idx = tmp_shape.rank() - idx;
+
+    OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) &&
+                (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx)));
+  }
+}
+void ShapeValidator::visit(const ir::operation::Tile &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(0)};
+  const auto multiple_index{node.getInputs().at(1)};
+
+  OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1);
+  OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank());
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
+}
+
+void ShapeValidator::visit(const ir::operation::Range &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)};
+  const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)};
+  const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)};
+
+  // Check for dimension constraints
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0);
+  OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0);
+  OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0);
+}
+
+void ShapeValidator::visit(const ir::operation::MatrixBandPart &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)};
+  const auto num_lower_index{
+      node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)};
+  const auto num_upper_index{
+      node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)};
+
+  // Check for dimension constraints
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2);     // input must be more than 2 dim matrix
+  OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar
+  OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar
+}
+
+void ShapeValidator::visit(const ir::operation::LogSoftmax &node)
+{
+  const auto output_index{node.getOutputs().at(0)};
+  if (_ctx.at(output_index).info().isDynamic())
+    return;
+
+  const auto input_index{node.getInputs().at(0)};
+
+  OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+}
+
+} // namespace compiler
+} // namespace onert
diff --git a/runtime/onert/core/src/compiler/ShapeValidator.h b/runtime/onert/core/src/compiler/ShapeValidator.h
new file mode 100644
index 000000000..f40c098d5
--- /dev/null
+++ b/runtime/onert/core/src/compiler/ShapeValidator.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_COMPILER_SHAPE_VALIDATOR_H__
+#define __ONERT_COMPILER_SHAPE_VALIDATOR_H__
+
+#include "ir/Layout.h"
+#include "ir/OperationVisitor.h"
+
+namespace onert
+{
+namespace ir
+{
+class Graph;
+class Operands;
+} // namespace ir
+} // namespace onert
+
+namespace onert
+{
+namespace compiler
+{
+
+class ShapeValidator : public ir::OperationVisitor
+{
+public:
+  ShapeValidator(void) = delete;
+  ShapeValidator(const ir::Graph &graph);
+
+public:
+  void operator()();
+
+public:
+  void visit(const ir::operation::BatchMatMul &node) override;
+  void visit(const ir::operation::BatchToSpaceND &node) override;
+  void visit(const ir::operation::BCQFullyConnected &node) override;
+  void visit(const ir::operation::BCQGather &node) override;
+  void visit(const ir::operation::Comparison &node) override;
+  void visit(const ir::operation::Softmax &node) override;
+  void visit(const ir::operation::InstanceNorm &node) override;
+  void visit(const ir::operation::Permute &node) override;
+  void visit(const ir::operation::Pool2D &node) override;
+  void visit(const ir::operation::Reduce &node) override;
+  void visit(const ir::operation::Transpose &node) override;
+  void visit(const ir::operation::RNN &node) override;
+  void visit(const ir::operation::SpaceToBatchND &node) override;
+  void visit(const ir::operation::SpaceToDepth &node) override;
+  void visit(const ir::operation::ElementwiseActivation &node) override;
+  void visit(const ir::operation::ElementwiseBinary &node) override;
+  void visit(const ir::operation::ElementwiseUnary &node) override;
+  void visit(const ir::operation::EmbeddingLookup &node) override;
+  void visit(const ir::operation::ExpandDims &node) override;
+  void visit(const ir::operation::HashtableLookup &node) override;
+  void visit(const ir::operation::TransposeConv &node) override;
+  void visit(const ir::operation::Gather &node) override;
+  void visit(const ir::operation::DepthToSpace &node) override;
+  void visit(const ir::operation::Pack &node) override;
+  void visit(const ir::operation::LSTM &node) override;
+  void visit(const ir::operation::L2Normalization &node) override;
+  void visit(const ir::operation::Unpack &node) override;
+  void visit(const ir::operation::Pad &node) override;
+  void visit(const ir::operation::Select &node) override;
+  void visit(const ir::operation::StridedSlice &node) override;
+  void visit(const ir::operation::Split &node) override;
+  void visit(const ir::operation::Shape &node) override;
+  void visit(const ir::operation::ResizeBilinear &node) override;
+  void visit(const ir::operation::Reverse &node) override;
+  void visit(const ir::operation::If &node) override;
+  void visit(const ir::operation::While &node) override;
+  void visit(const ir::operation::SquaredDifference &node) override;
+  void visit(const ir::operation::Tile &node) override;
+  void visit(const ir::operation::Range &node) override;
+  void visit(const ir::operation::MatrixBandPart &node) override;
+  void visit(const ir::operation::LogSoftmax &node) override;
+
+private:
+  void checkUnaryOp(const ir::Operation &node);
+
+private:
+  // TODO Remove _ctx field
+  const ir::Graph &_graph;
+  const ir::Operands &_ctx;
+  ir::Layout _current_op_seq_layout;
+};
+
+} // namespace compiler
+} // namespace onert
+
+#endif // __ONERT_COMPILER_SHAPE_VALIDATOR_H__
diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc
index 4eba1ff49..df129d98b 100644
--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc
+++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc
@@ -147,16 +147,26 @@ void StaticShapeInferer::visit(const ir::operation::ArgMax &op)
   const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
   const auto &input = _operands.at(input_idx);
 
+  const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
+  const auto &axis = _operands.at(axis_idx);
+
   // get mutable output operand
   const auto output_idx = op.getOutputs().at(0);
   ir::Operand &output = _operands.at(output_idx);
-  const auto rank = input.info().shape().rank();
-  const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis);
 
-  assert(0 <= axis && axis < rank);
+  if (!axis.isConstant())
+  {
+    output.info().setDynamic();
+    _return_has_dynamic_tensor = true;
+    return;
+  }
+
+  const auto rank = input.info().shape().rank();
+  auto axis_value = axis.asScalar<int32_t>();
+  axis_value = axis_value < 0 ? axis_value + rank : axis_value;
 
   // re-sizing output shape
-  ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis, rank);
+  ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis_value, rank);
   output.info().shape(new_shape);
 }
 
@@ -165,13 +175,60 @@ void StaticShapeInferer::visit(const ir::operation::BatchMatMul &op)
   const auto lhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::LHS);
   const auto rhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::RHS);
   const auto output_index = op.getOutputs().at(0);
-  const auto lhs = _operands.at(lhs_index);
-  const auto rhs = _operands.at(rhs_index);
+  const auto &lhs = _operands.at(lhs_index);
+  const auto &rhs = _operands.at(rhs_index);
   auto &output = _operands.at(output_index);
   auto new_shape = shape_inference::inferBatchMatMulShape(lhs.shape(), rhs.shape(), op.param());
   output.info().shape(new_shape);
 }
 
+void StaticShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
+{
+  const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)};
+  const auto &input = _operands.at(input_idx);
+
+  const auto cluster_idx{
+      op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+  const auto &cluster = _operands.at(cluster_idx);
+
+  const auto output_idx = op.getOutputs().at(0);
+  ir::Operand &output = _operands.at(output_idx);
+
+  auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base());
+  assert(cluster_buf);
+
+  // re-sizing output shape
+  ir::Shape new_shape = shape_inference::inferBCQFullyConnectedShape(
+      input.info().shape(), cluster.info().shape(), cluster_buf);
+  output.info().shape(new_shape);
+}
+
+void StaticShapeInferer::visit(const ir::operation::BCQGather &op)
+{
+  const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)};
+  const auto &indices = _operands.at(indices_idx);
+
+  const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)};
+  const auto &input_binary = _operands.at(input_binary_idx);
+
+  const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)};
+  const auto &cluster = _operands.at(cluster_idx);
+
+  const auto output_idx = op.getOutputs().at(0);
+  ir::Operand &output = _operands.at(output_idx);
+
+  auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base());
+  assert(cluster_buf);
+
+  auto rank = input_binary.shape().rank();
+
+  // re-sizing output shape
+  ir::Shape new_shape = shape_inference::inferBCQGatherShape(
+      indices.info().shape(), cluster.info().shape(), cluster_buf, rank, op.param());
+
+  output.info().shape(new_shape);
+}
+
 void StaticShapeInferer::visit(const ir::operation::BinaryArithmetic &op)
 {
   handleBinaryArithmeticOp(op, op.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS),
@@ -439,6 +496,98 @@ void StaticShapeInferer::visit(const ir::operation::L2Normalization &op)
   handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT));
 }
 
+void StaticShapeInferer::visit(const ir::operation::LSTM &op)
+{
+  const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+  auto &output = _operands.at(output_index);
+
+  const auto output_state_out_index{
+      op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+
+  const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+
+  const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+
+  if (output.info().isDynamic() || (_operands.exist(output_state_out_index) &&
+                                    _operands.at(output_state_out_index).info().isDynamic()) ||
+      (_operands.exist(cell_state_out_index) &&
+       _operands.at(cell_state_out_index).info().isDynamic()) ||
+      (_operands.exist(scratch_buffer_index) &&
+       _operands.at(scratch_buffer_index).info().isDynamic()))
+    return;
+
+  const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+  const auto &input = _operands.at(input_index);
+
+  const auto input_to_output_weights_index{
+      op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+  const auto &input_to_output_weights = _operands.at(input_to_output_weights_index);
+
+  const auto recurrent_to_output_weights_index{
+      op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+  const auto &recurrent_to_output_weights = _operands.at(recurrent_to_output_weights_index);
+
+  // re-sizing outputs
+  const int n_batch = (input.shape().rank() == 3 && op.param().time_major) ? input.shape().dim(1)
+                                                                           : input.shape().dim(0);
+  const int n_cell = input_to_output_weights.shape().dim(0);
+  const int n_output = recurrent_to_output_weights.shape().dim(1);
+  if (input.shape().rank() == 3)
+  {
+    if (op.param().time_major)
+      output.info().shape(ir::Shape{input.shape().dim(0), n_batch, n_output});
+    else
+      output.info().shape(ir::Shape{n_batch, input.shape().dim(1), n_output});
+  }
+  else
+  {
+    assert(input.shape().rank() == 2);
+    output.info().shape(ir::Shape{n_batch, n_output});
+  }
+
+  if (_operands.exist(output_state_out_index))
+  {
+    auto &output_state_out = _operands.at(output_state_out_index);
+    output_state_out.info().shape(ir::Shape{n_batch, n_output});
+  }
+
+  if (_operands.exist(cell_state_out_index))
+  {
+    auto &cell_state_out = _operands.at(cell_state_out_index);
+    cell_state_out.info().shape(ir::Shape{n_batch, n_cell});
+  }
+
+  if (_operands.exist(scratch_buffer_index))
+  {
+    auto &scratch_buffer = _operands.at(scratch_buffer_index);
+
+    const auto input_to_input_weights_index{
+        op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
+    const auto recurrent_to_input_weights_index{
+        op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
+
+    bool has_input_to_input_weights =
+        _operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+        _operands.at(input_to_input_weights_index).shape().dim(1) != 0;
+    bool has_recurrent_to_input_weights =
+        _operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+        _operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+
+    // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG).
+    // true: no CIFG
+    // false: CIFG
+    bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+    if (has_cifg_param)
+    {
+      scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 4});
+    }
+    else
+    {
+      scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 3});
+    }
+  }
+}
+
 void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op)
 {
   handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT));
@@ -683,9 +832,29 @@ void StaticShapeInferer::visit(const ir::operation::ResizeBilinear &op)
   const auto output_idx = op.getOutputs().at(0);
   ir::Operand &output = _operands.at(output_idx);
 
+  int32_t height_out, width_out;
+  if (op.getInputs().size() == 2)
+  {
+    auto &size = _operands.at(op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE));
+    if (!size.isConstant())
+    {
+      output.info().setDynamic();
+      _return_has_dynamic_tensor = true;
+      return;
+    }
+    const auto size_v = size.asVector<std::int32_t>();
+    height_out = size_v[0];
+    width_out = size_v[1];
+  }
+  else
+  {
+    height_out = op.param().height_out;
+    width_out = op.param().width_out;
+  }
+
   // Shape inferencing logic based on Params
-  ir::Shape new_shape = shape_inference::inferResizeBilinearShape(
-      input.shape(), op.param().height_out, op.param().width_out);
+  ir::Shape new_shape =
+      shape_inference::inferResizeBilinearShape(input.shape(), height_out, width_out);
 
   // if size_op is from Const, TFLC put the shape of output into tensor
   if (new_shape != output.shape())
@@ -803,21 +972,35 @@ void StaticShapeInferer::visit(const ir::operation::SpaceToBatchND &op)
 
 void StaticShapeInferer::visit(const ir::operation::Split &op)
 {
-  const auto input_idx{op.getInputs().at(0)};
+  const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)};
   const auto &input = _operands.at(input_idx);
 
-  const auto axis = op.param().axis;
+  const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)};
+  const auto &axis = _operands.at(axis_idx);
+
+  auto outputs = op.getOutputs();
+  if (!axis.isConstant())
+  {
+    for (auto output_idx : outputs)
+    {
+      ir::Operand &output = _operands.at(output_idx);
+      output.info().setDynamic();
+    }
+    _return_has_dynamic_tensor = true;
+    return;
+  }
+
   const auto num_splits = op.param().num_splits;
 
   const auto rank = input.info().shape().rank();
-  auto axis_resolved = axis < 0 ? axis + rank : axis;
+  auto axis_value = axis.asScalar<int32_t>();
+  axis_value = axis_value < 0 ? axis_value + rank : axis_value;
 
-  assert(0 <= axis_resolved && axis_resolved < rank);
+  assert(0 <= axis_value && axis_value < rank);
 
   ir::Shape new_shape =
-      shape_inference::inferSplitShape(input.info().shape(), axis_resolved, num_splits);
-  auto output_tensors = op.getOutputs();
-  for (auto output_idx : output_tensors)
+      shape_inference::inferSplitShape(input.info().shape(), axis_value, num_splits);
+  for (auto output_idx : outputs)
   {
     ir::Operand &output = _operands.at(output_idx);
     output.info().shape(new_shape);
@@ -838,13 +1021,6 @@ void StaticShapeInferer::visit(const ir::operation::Squeeze &op)
   const auto output_idx = op.getOutputs().at(0);
   ir::Operand &output = _operands.at(output_idx);
 
-  if (input.info().isDynamic())
-  {
-    output.info().setDynamic();
-    _return_has_dynamic_tensor = true;
-    return;
-  }
-
   // Squeeze output shpae
   ir::Shape new_shape = shape_inference::inferSqueezeShape(input.info().shape(), op.param());
   output.info().shape(new_shape);
@@ -909,7 +1085,8 @@ void StaticShapeInferer::visit(const ir::operation::Tile &op)
   assert(multiplier_buffer);
 
   // re-sizing output shape
-  auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer);
+  auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer,
+                                                   multiplier.shape().num_elements());
   output.info().shape(new_shape);
 }
 
@@ -918,14 +1095,43 @@ void StaticShapeInferer::visit(const ir::operation::Transpose &op)
   const auto input_idx{op.getInputs().at(ir::operation::Transpose::Input::INPUT)};
   const auto &input = _operands.at(input_idx);
 
+  const auto perm_idx{op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
+  const auto &perm = _operands.at(perm_idx);
+
+  // perm.shape() != ir::Shape{0} means that perm is (n-1...0)
+  // TODO This condition changes to perm.num_elements() == 0
+  const auto is_regular_transpose = perm.shape() == ir::Shape{0};
+
   // get mutable output operand
   const auto output_idx = op.getOutputs().at(0);
-  ir::Operand &output = _operands.at(output_idx);
-  const auto perm{op.param().perm};
-  // const auto rank{op.param().rank};
+  auto &output = _operands.at(output_idx);
+  if (!perm.isConstant() && !is_regular_transpose)
+  {
+    output.info().setDynamic();
+    _return_has_dynamic_tensor = true;
+    return;
+  }
 
-  // set output shape, based on input and params
-  ir::Shape new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm);
+  ir::Shape new_shape;
+  if (is_regular_transpose)
+  {
+    // Call by (n-1...0)
+    new_shape = shape_inference::inferTransposeShape(input.info().shape(), nullptr, 0);
+  }
+  else
+  {
+    // Check rank
+    if (input.info().shape().rank() != static_cast<int>(perm.info().shape().num_elements()))
+    {
+      throw std::runtime_error("StaticShapeInferer failed, bad rank size: " +
+                               std::to_string(perm.info().shape().num_elements()));
+    }
+
+    // set output shape, based on input and params
+    const auto perm_buf = reinterpret_cast<const int32_t *>(perm.data()->base());
+    new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm_buf,
+                                                     perm.shape().num_elements());
+  }
   output.info().shape(new_shape);
 }
 
diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h
index 8be87b081..e42225cbf 100644
--- a/runtime/onert/core/src/compiler/TensorRegistries.h
+++ b/runtime/onert/core/src/compiler/TensorRegistries.h
@@ -69,7 +69,7 @@ public:
     return _cf_tensor_reg;
   }
 
-  std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind) const
+  backend::ITensor *getITensor(ir::OperandIndex ind) const
   {
     for (auto &tensor_reg : _tensor_regs)
     {
diff --git a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
index 647669e46..ef6240894 100644
--- a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
+++ b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
@@ -44,7 +44,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O
       const auto key = ReplaceKey{input, factor};
       if (_replace_operands_map.count(key) == 0)
       {
-        auto new_object = object;
+        ir::Operand new_object(object);
         new_object.unsetDef();
         // TODO Remove const_case
         const_cast<ir::OperationIndexSet &>(new_object.getUses()).clear();
@@ -81,7 +81,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O
   }
 
   // Now this runtime does not support the node making output as constant
-  for (const auto &output : node.getOutputs())
+  for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
   {
     UNUSED_RELEASE(output);
     assert(!_graph.operands().at(output).isConstant());
diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc
new file mode 100644
index 000000000..c176f6ffb
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConstantOutputPass.h"
+
+#include "ir/Graph.h"
+#include "ir/operation/Permute.h"
+#include "util/logging.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+void ConstantOutputPass::callback(const ir::OperandIndex &ind, ir::Operand &obj)
+{
+  if (!_graph.getOutputs().contains(ind) || !obj.isConstant())
+    return;
+
+  auto permute_input_ind = _graph.addOperand(obj.shape(), obj.typeInfo());
+  auto &permute_input_obj = _graph.operands().at(permute_input_ind);
+
+  // Move the const data
+  permute_input_obj.data(obj.shareData());
+  obj.releaseData();
+  obj.info().setAsNonConst();
+
+  using ir::operation::Permute;
+  auto permute_obj = std::make_unique<Permute>(permute_input_ind, ind, Permute::Type::COPY);
+  auto permute_ind = _graph.operations().push(std::move(permute_obj));
+
+  permute_input_obj.insertUse(permute_ind);
+  obj.setDef(permute_ind);
+
+  // Make the operations that uses this operand to use the generated operand
+  auto orig_uses = obj.getUses();
+  for (auto use : orig_uses)
+  {
+    permute_input_obj.insertUse(use);
+    obj.removeUse(use);
+    _graph.operations().at(use).replaceInputs(ind, permute_input_ind);
+  }
+
+  VERBOSE(ConstantOutputPass) << "Permute Op inserted for a constant ouput, node index : "
+                              << permute_ind << std::endl;
+  VERBOSE(ConstantOutputPass) << "  - Input (inserted) Operand : " << permute_input_ind
+                              << std::endl;
+  VERBOSE(ConstantOutputPass) << "  - Output(original) Operand : " << ind << std::endl;
+}
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h
new file mode 100644
index 000000000..193dd3a68
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__
+#define __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__
+
+#include "OperandPass.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+/**
+ * @brief Pass to specially handle constant model outputs
+ *
+ * As an output buffer is given right before an execution but constant initialization is done at
+ * prepare phase, the current runtime structure cannot handle when an output is constant.
+ * To resolve this problem, this pass inserts a Permute layer with a const input and make the model
+ * output tensor to be its output.
+ *
+ * e.g.)
+ *
+ * ((Const Output))
+ *
+ * becomes
+ *
+ * (Const) -> [Permute] -> ((Output))
+ *
+ * Note that this is a mandatory pass for Graph.
+ */
+class ConstantOutputPass : public OperandPass
+{
+public:
+  using OperandPass::OperandPass;
+
+public:
+  std::string id() final { return "ConstantOutputPass"; }
+
+public:
+  void callback(const ir::OperandIndex &i, ir::Operand &o) final;
+};
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
+
+#endif // __ONERT_COMPILER_PASS_CONSTANT_INSERTION_PASS_H__
diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc
new file mode 100644
index 000000000..f50fae0d3
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OddOutputPass.h"
+
+#include "ir/Graph.h"
+#include "ir/operation/Permute.h"
+#include "util/logging.h"
+#include "util/Utils.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+void OddOutputPass::run()
+{
+  auto &outputs = _graph.getOutputs();
+
+  VERBOSE(OddOutputPass) << "Case 1 : An operand which is a model output and a model input"
+                         << std::endl;
+  for (auto &ind : outputs)
+  {
+    if (_graph.getInputs().contains(ind))
+    {
+      auto permute_output_ind = insertPermute(ind);
+      // Update the output to be newly added operand
+      _graph.getOutputs().replace(ind, permute_output_ind);
+    }
+  }
+
+  VERBOSE(OddOutputPass) << "Case 2 : Two or more duplicated outputs" << std::endl;
+  std::unordered_set<ir::OperandIndex> occurence;
+  for (auto &ind : outputs)
+  {
+    auto &obj = _graph.operands().at(ind);
+    if (occurence.count(ind) == 0)
+    {
+      occurence.insert(ind);
+      continue;
+    }
+
+    // Panic when it is const, it must have been handled earlier in another pass
+    UNUSED_RELEASE(obj);
+    assert(!obj.isConstant());
+
+    auto permute_output_ind = insertPermute(ind);
+    ind = permute_output_ind; // Replace output index to fix output duplication
+  }
+}
+
+ir::OperandIndex OddOutputPass::insertPermute(ir::OperandIndex ind)
+{
+  auto &obj = _graph.operands().at(ind);
+  auto output_ind = _graph.addOperand(obj.shape(), obj.typeInfo());
+  auto &output_obj = _graph.operands().at(output_ind);
+
+  using ir::operation::Permute;
+  auto permute_obj = std::make_unique<Permute>(ind, output_ind, Permute::Type::COPY);
+  auto permute_ind = _graph.operations().push(std::move(permute_obj));
+
+  output_obj.setDef(permute_ind);
+  obj.insertUse(permute_ind);
+
+  VERBOSE(OddOutputPass) << "Permute Op inserted for a constant output, node index : "
+                         << permute_ind << std::endl;
+  VERBOSE(OddOutputPass) << "  - Input (original) Operand : " << ind << std::endl;
+  VERBOSE(OddOutputPass) << "  - Output(inserted) Operand : " << output_ind << std::endl;
+
+  return output_ind;
+}
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.h b/runtime/onert/core/src/compiler/pass/OddOutputPass.h
new file mode 100644
index 000000000..2accbac60
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__
+#define __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__
+
+#include <unordered_set>
+
+#include "Pass.h"
+#include "ir/Index.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+/**
+ * @brief Pass to specially handle odd outputs in a subgraph
+ *
+ * Runtime Graph IR requires every input or output must have distinct tensor index, this is onert's
+ * restriction. However we allow duplication of indices in the models(or API). So we should
+ * transform the graph after model-loading.
+ *
+ * This is necessary since our API lets users to set different buffers for each input and output so
+ * it is unavoidable that we must copy the value at runtime.
+ *
+ * Note that this is a mandatory pass for Graph.
+ *
+ * Case 1 : An operand which is a model output and a model input
+ *
+ * Create an operand and insert a Permute(copy) op between them. And change the output to be the
+ * newly generated operand.
+ *
+ * e.g.)
+ *
+ * ```
+ * ((#0 Input0 and also Output0))
+ * becomes
+ * ((#0 Input0)) -> [#0 Permute] -> ((#1 Output0))
+ * ```
+ *
+ * Case 2 : Two or more duplicated outputs
+ *
+ * Do the same with Case 1, but between two outputs of the same tensor index.
+ *
+ * e.g.)
+ *
+ * ```
+ * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0 and also Output1))
+ * becomes
+ * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0)) [#1 Permute] -> ((#2 Output1))
+ * ```
+ *
+ */
+class OddOutputPass : public Pass
+{
+public:
+  using Pass::Pass;
+
+public:
+  std::string id() final { return "OddOutputPass"; }
+
+public:
+  void run() override;
+
+private:
+  ir::OperandIndex insertPermute(ir::OperandIndex input);
+};
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
+
+#endif // __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__
diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc
new file mode 100644
index 000000000..2a058c8ac
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PassRunner.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+PassRunner &PassRunner::append(std::unique_ptr<Pass> pass)
+{
+  _passes.emplace_back(std::move(pass));
+  return *this;
+}
+
+void PassRunner::run()
+{
+  for (auto &pass : _passes)
+  {
+    VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl;
+    pass->run();
+    VERBOSE(PassRunner) << "Finished running '" << pass->id() << "'" << std::endl;
+    // TODO Dump graph(LowerInfo, OpSequence, ...)?
+  }
+}
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.h b/runtime/onert/core/src/compiler/pass/PassRunner.h
new file mode 100644
index 000000000..a43c83f89
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/PassRunner.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_COMPILER_PASS_PASS_RUNNER_H__
+#define __ONERT_COMPILER_PASS_PASS_RUNNER_H__
+
+#include <initializer_list>
+#include <memory>
+#include <vector>
+
+#include "Pass.h"
+#include "util/logging.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+/**
+ * @brief Composite passes with logging
+ */
+class PassRunner
+{
+public:
+  PassRunner() = default;
+  PassRunner &append(std::unique_ptr<Pass> pass);
+
+  void run();
+
+private:
+  std::vector<std::unique_ptr<Pass>> _passes;
+};
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
+
+#endif // __ONERT_COMPILER_PASS_PASS_RUNNER_H__
diff --git a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
index f01697034..504f1b995 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
+++ b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
@@ -53,6 +53,20 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node)
 
   if (_graph.getOutputs().contains(out_operand))
   {
+    // If the input is a const, we cannot remove it since we cannot put the constant data in the
+    // output buffer during prepare phase.
+    auto permute_input = node.getInputs().at(0);
+    if (_graph.operands().at(permute_input).isConstant())
+      return;
+    // If the input is a model input, we cannot remove it since our API lets users to set different
+    // buffers for inputs and outputs even though one tensor is both at the same time.
+    auto permute_output = node.getOutputs().at(0);
+    if (_graph.getInputs().contains(permute_input) && _graph.getOutputs().contains(permute_output))
+      return;
+    // Likewise, if copying between outputs to outputs, keep it.
+    if (_graph.getOutputs().contains(permute_input) && _graph.getOutputs().contains(permute_output))
+      return;
+
     // Exceptional case : When the output operand is a model output
     // In this case we keep the output and remove the input
 
diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
index c5c95c726..93d125307 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
+++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
@@ -212,7 +212,7 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
       }
     }
 
-    for (const auto &output : node.getOutputs() | Remove::DUPLICATED)
+    for (const auto &output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
     {
       auto lower_info = _lowered_graph.getLowerInfo(output);
       lower_info->removeDefPermuteFactor(removed_factor);
@@ -279,6 +279,18 @@ void PermutationOperationPass::visit(const ir::operation::Gather &node)
   }
 }
 
+void PermutationOperationPass::visit(const ir::operation::OneHot &node)
+{
+  const auto &output_ind = node.getOutputs().at(0);
+  const auto &output_obj = _graph.operands().at(output_ind);
+  const auto &output_shape = output_obj.shape();
+
+  if (output_shape.rank() >= 4)
+  {
+    changeToKeepLayout(node);
+  }
+}
+
 void PermutationOperationPass::visit(const ir::operation::Pack &node)
 {
   const auto &input_ind = node.getInputs().at(ir::operation::Reshape::Input::INPUT);
diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h
index 2dd76b971..cea5de288 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h
+++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h
@@ -44,6 +44,7 @@ public:
   void visit(const ir::operation::Concat &) final;
   void visit(const ir::operation::ElementwiseBinary &) final;
   void visit(const ir::operation::ElementwiseUnary &) final;
+  void visit(const ir::operation::OneHot &) final;
   void visit(const ir::operation::Pack &) final;
   void visit(const ir::operation::PReLU &) final;
   void visit(const ir::operation::SquaredDifference &) final;
diff --git a/runtime/onert/core/src/dumper/dot/DotDumper.cc b/runtime/onert/core/src/dumper/dot/DotDumper.cc
index 118057f09..8f3cf328c 100644
--- a/runtime/onert/core/src/dumper/dot/DotDumper.cc
+++ b/runtime/onert/core/src/dumper/dot/DotDumper.cc
@@ -81,11 +81,8 @@ void DotDumper::dump(const std::string &tag)
     }
     else
     {
-      showing_cond = !object.isConstant();
-    }
-    if (object.isConstant() || _graph.getInputs().contains(index))
-    {
-      showing_cond = showing_cond && (object.getUses().size() > 0);
+      showing_cond =
+          !object.isConstant() || (_graph.getInputs() + _graph.getOutputs()).contains(index);
     }
     if (showing_cond)
     {
diff --git a/runtime/onert/core/src/exec/DataflowExecutor.cc b/runtime/onert/core/src/exec/DataflowExecutor.cc
index a69ae9cdb..53bc3c204 100644
--- a/runtime/onert/core/src/exec/DataflowExecutor.cc
+++ b/runtime/onert/core/src/exec/DataflowExecutor.cc
@@ -77,14 +77,12 @@ bool DataflowExecutor::noWaitingJobs()
                      [](const std::unique_ptr<Job> &job) { return job == nullptr; });
 }
 
-DataflowExecutor::DataflowExecutor(
-    std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-    const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-    const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-    const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs,
-    compiler::CodeMap &&code_map)
-    : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
-                   std::move(tensor_mgrs)},
+DataflowExecutor::DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                   const std::vector<backend::ITensor *> &input_tensors,
+                                   const std::vector<backend::ITensor *> &output_tensors,
+                                   const compiler::TensorRegistries &tensor_regs,
+                                   compiler::CodeMap &&code_map)
+    : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs},
       _code_map{std::move(code_map)}
 {
   VERBOSE(DataflowExecutor) << "Constructing Dataflow Executor" << std::endl;
@@ -161,6 +159,8 @@ void DataflowExecutor::executeImpl()
 
     _subject.notifyJobBegin(this, op_seq, backend);
 
+    job->fn_seq()->initRunning();
+
     // check if FunctionSequence needs to handle dynamic tensor
     bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists;
     job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor);
diff --git a/runtime/onert/core/src/exec/DataflowExecutor.h b/runtime/onert/core/src/exec/DataflowExecutor.h
index 8d60e3e4b..69dfda15c 100644
--- a/runtime/onert/core/src/exec/DataflowExecutor.h
+++ b/runtime/onert/core/src/exec/DataflowExecutor.h
@@ -50,10 +50,9 @@ public:
    * @param code_map OpSequence and its code map
    */
   DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                   const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-                   const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-                   const compiler::TensorRegistries &tensor_regs,
-                   backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map);
+                   const std::vector<backend::ITensor *> &input_tensors,
+                   const std::vector<backend::ITensor *> &output_tensors,
+                   const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map);
 
   void executeImpl() override;
 
diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc
index 70bddfce4..0f604c43f 100644
--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc
+++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc
@@ -23,14 +23,6 @@ namespace onert
 namespace exec
 {
 
-inline backend::IDynamicTensorManager *
-dynamicTensorManagerOf(const std::shared_ptr<backend::ITensor> &tensor)
-{
-  if (!tensor->dynamic_tensor_manager())
-    throw std::runtime_error{"Dynamic Tensor Manager is not available for this tensor."};
-  return tensor->dynamic_tensor_manager();
-}
-
 void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
                                                    const ir::OperandIndex lhs_idx,
                                                    const ir::OperandIndex rhs_idx)
@@ -64,7 +56,7 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
 
   ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape);
 
-  dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -96,30 +88,32 @@ void DynamicShapeInferer::handleSimpleUnaryOp(const ir::Operation &op,
   auto output_ind = op.getOutputs().at(0);
   auto output = _tensor_registry->getITensor(output_ind);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
 void DynamicShapeInferer::visit(const ir::operation::ArgMax &op)
 {
   const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
-  const auto &input = _tensor_registry->getITensor(input_idx);
-  auto input_shape = input->getShape();
+  const auto input = _tensor_registry->getITensor(input_idx);
+
+  const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
+  const auto axis = _tensor_registry->getITensor(axis_idx);
+
+  auto output_ind = op.getOutputs().at(0);
+  auto output = _tensor_registry->getITensor(output_ind);
 
   if (!input->is_dynamic())
     return;
 
+  auto input_shape = input->getShape();
+  auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer());
   const auto rank = input_shape.rank();
-  const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis);
-
-  assert(0 <= axis && axis < rank);
-
-  auto output_ind = op.getOutputs().at(0);
-  auto output = _tensor_registry->getITensor(output_ind);
+  axis_value = axis_value < 0 ? axis_value + rank : axis_value;
 
-  ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis, rank);
+  ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis_value, rank);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -141,7 +135,68 @@ void DynamicShapeInferer::visit(const ir::operation::BatchMatMul &op)
   // TODO
 
   auto new_shape = shape_inference::inferBatchMatMulShape(lhs_shape, rhs_shape, op.param());
-  dynamicTensorManagerOf(output)->applyShape(output_index, new_shape);
+  output->applyShape(new_shape);
+}
+
+void DynamicShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
+{
+  const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)};
+  const auto &input = _tensor_registry->getITensor(input_idx);
+
+  const auto cluster_idx{
+      op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+  const auto &cluster = _tensor_registry->getITensor(cluster_idx);
+  assert(cluster->is_constant());
+
+  if (!input->is_dynamic())
+    return;
+
+  auto input_shape = input->getShape();
+  auto cluster_shape = cluster->getShape();
+
+  auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer());
+  assert(cluster_buf);
+
+  ir::Shape new_shape =
+      shape_inference::inferBCQFullyConnectedShape(input_shape, cluster_shape, cluster_buf);
+
+  auto output_ind = op.getOutputs().at(0);
+  auto output = _tensor_registry->getITensor(output_ind);
+
+  output->applyShape(new_shape);
+  assert(output->buffer() != nullptr);
+}
+
+void DynamicShapeInferer::visit(const ir::operation::BCQGather &op)
+{
+  const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)};
+  const auto &indices = _tensor_registry->getITensor(indices_idx);
+
+  const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)};
+  const auto &input_binary = _tensor_registry->getITensor(input_binary_idx);
+
+  const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)};
+  const auto &cluster = _tensor_registry->getITensor(cluster_idx);
+  assert(cluster->is_constant());
+
+  if (!indices->is_dynamic())
+    return;
+
+  auto indices_shape = indices->getShape();
+  auto cluster_shape = cluster->getShape();
+  auto rank = input_binary->getShape().rank();
+
+  auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer());
+  assert(cluster_buf);
+
+  ir::Shape new_shape = shape_inference::inferBCQGatherShape(indices_shape, cluster_shape,
+                                                             cluster_buf, rank, op.param());
+
+  auto output_ind = op.getOutputs().at(0);
+  auto output = _tensor_registry->getITensor(output_ind);
+
+  output->applyShape(new_shape);
+  assert(output->buffer() != nullptr);
 }
 
 void DynamicShapeInferer::visit(const ir::operation::BinaryArithmetic &op)
@@ -170,7 +225,7 @@ void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op)
       shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer()));
 
   // set output shape and output buffer
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -236,7 +291,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op)
     for (auto input_ind : op.getInputs())
     {
       auto input = _tensor_registry->getITensor(input_ind);
-      if (input != first_input && !isConcatible(first_input.get(), input.get(), op.param().axis))
+      if (input != first_input && !isConcatible(first_input, input, op.param().axis))
         throw std::runtime_error("input shapes does not matched for concat");
     }
   }
@@ -255,7 +310,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op)
   auto output = _tensor_registry->getITensor(output_ind);
   auto output_shape = shape_inference::inferConcatShape(in_shapes, op.param());
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
 }
 
 void DynamicShapeInferer::visit(const ir::operation::Conv2D &op)
@@ -278,7 +333,7 @@ void DynamicShapeInferer::visit(const ir::operation::Conv2D &op)
 
   ir::Shape output_shape = shape_inference::inferConv2DShape(input_shape, ker_shape, op.param());
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -338,7 +393,7 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op)
 
   auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_buf[0]);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -354,14 +409,14 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op)
   if ((!input->is_dynamic()) && (!output->is_dynamic()))
     return;
 
-  assert(input.get()->data_type() == ir::DataType::INT32);
+  assert(input->data_type() == ir::DataType::INT32);
 
   auto input_buf = reinterpret_cast<const int32_t *>(input->buffer());
   assert(input_buf);
 
   auto output_shape = shape_inference::inferFillShape(input_shape, input_buf);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -384,7 +439,7 @@ void DynamicShapeInferer::visit(const ir::operation::FullyConnected &op)
   auto output_ind = op.getOutputs().at(0);
   auto output = _tensor_registry->getITensor(output_ind);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -416,7 +471,7 @@ void DynamicShapeInferer::visit(const ir::operation::Gather &op)
   auto output_ind = op.getOutputs().at(0);
   auto output = _tensor_registry->getITensor(output_ind);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -425,6 +480,109 @@ void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op)
   handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT));
 }
 
+void DynamicShapeInferer::visit(const ir::operation::LSTM &op)
+{
+  const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+  auto output = _tensor_registry->getITensor(output_index);
+
+  const auto output_state_out_index{
+      op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+
+  const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+
+  const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+
+  if (!output->is_dynamic() &&
+      !(_tensor_registry->getITensor(output_state_out_index) != nullptr &&
+        _tensor_registry->getITensor(output_state_out_index)->is_dynamic()) &&
+      !(_tensor_registry->getITensor(cell_state_out_index) != nullptr &&
+        _tensor_registry->getITensor(cell_state_out_index)->is_dynamic()) &&
+      !(_tensor_registry->getITensor(scratch_buffer_index) != nullptr &&
+        _tensor_registry->getITensor(cell_state_out_index)->is_dynamic()))
+    return;
+
+  const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+  const auto input = _tensor_registry->getITensor(input_index);
+  const auto input_shape = input->getShape();
+
+  const auto input_to_output_weights_index{
+      op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+  const auto input_to_output_weights = _tensor_registry->getITensor(input_to_output_weights_index);
+  const auto input_to_output_weights_shape = input_to_output_weights->getShape();
+
+  const auto recurrent_to_output_weights_index{
+      op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+  const auto recurrent_to_output_weights =
+      _tensor_registry->getITensor(recurrent_to_output_weights_index);
+  const auto recurrent_to_output_weights_shape = recurrent_to_output_weights->getShape();
+
+  // re-sizing outputs
+  const int n_batch =
+      (input_shape.rank() == 3 && op.param().time_major) ? input_shape.dim(1) : input_shape.dim(0);
+  const int n_cell = input_to_output_weights_shape.dim(0);
+  const int n_output = recurrent_to_output_weights_shape.dim(1);
+  if (input_shape.rank() == 3)
+  {
+    if (op.param().time_major)
+      output->applyShape(ir::Shape{input_shape.dim(0), n_batch, n_output});
+    else
+      output->applyShape(ir::Shape{n_batch, input_shape.dim(1), n_output});
+  }
+  else
+  {
+    assert(input_shape.rank() == 2);
+    output->applyShape(ir::Shape{n_batch, n_output});
+  }
+  assert(output->buffer() != nullptr);
+
+  auto output_state_out = _tensor_registry->getITensor(output_state_out_index);
+  if (output_state_out != nullptr)
+  {
+    output_state_out->applyShape(ir::Shape{n_batch, n_output});
+    assert(output_state_out->buffer() != nullptr);
+  }
+
+  auto cell_state_out = _tensor_registry->getITensor(cell_state_out_index);
+  if (cell_state_out != nullptr)
+  {
+    cell_state_out->applyShape(ir::Shape{n_batch, n_cell});
+    assert(cell_state_out->buffer() != nullptr);
+  }
+
+  auto scratch_buffer = _tensor_registry->getITensor(scratch_buffer_index);
+  if (scratch_buffer != nullptr)
+  {
+    const auto input_to_input_weights_index{
+        op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
+    const auto recurrent_to_input_weights_index{
+        op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
+
+    const auto input_to_input_weights_shape =
+        _tensor_registry->getITensor(input_to_input_weights_index)->getShape();
+    bool has_input_to_input_weights =
+        input_to_input_weights_shape.dim(0) != 0 && input_to_input_weights_shape.dim(1) != 0;
+
+    const auto recurrent_to_input_weights_shape =
+        _tensor_registry->getITensor(recurrent_to_input_weights_index)->getShape();
+    bool has_recurrent_to_input_weights = recurrent_to_input_weights_shape.dim(0) != 0 &&
+                                          recurrent_to_input_weights_shape.dim(1) != 0;
+
+    // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG).
+    // true: no CIFG
+    // false: CIFG
+    bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+    if (has_cifg_param)
+    {
+      scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 4});
+    }
+    else
+    {
+      scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 3});
+    }
+    assert(scratch_buffer->buffer() != nullptr);
+  }
+}
+
 void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op)
 {
   handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT));
@@ -452,7 +610,7 @@ void DynamicShapeInferer::visit(const ir::operation::OneHot &op)
   const auto axis_val = op.param().axis;
 
   ir::Shape new_shape = shape_inference::inferOnehotShape(indices_shape, *depth_buf, axis_val);
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -488,7 +646,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pack &op)
 
   ir::Shape new_shape = shape_inference::inferPackShape(input_shape, axis, rank, num);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -515,7 +673,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pad &op)
       shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements());
 
   // change output shape and reallocate output tensor memory
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -567,7 +725,7 @@ void DynamicShapeInferer::visit(const ir::operation::Range &op)
         *reinterpret_cast<int32_t *>(limit_tensor->buffer()),
         *reinterpret_cast<int32_t *>(delta_tensor->buffer()));
   }
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -611,7 +769,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reduce &op)
 
   ir::Shape new_shape = shape_inference::inferReduceShape(input_shape, axes_vec, keep_dims);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -665,7 +823,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
     if (output_shape != output->getShape() || output->buffer() == nullptr)
     {
       // change on output shape
-      dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+      output->applyShape(output_shape);
     }
     assert(output->buffer() != nullptr);
   }
@@ -681,7 +839,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
     if (output_shape != output->getShape() || output->buffer() == nullptr)
     {
       // change on output shape
-      dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+      output->applyShape(output_shape);
     }
     assert(output->buffer() != nullptr);
   }
@@ -705,14 +863,35 @@ void DynamicShapeInferer::visit(const ir::operation::ResizeBilinear &op)
     return;
 
   // getting output shape from input shape and Params
-  auto output_shape = shape_inference::inferResizeBilinearShape(
-      input->getShape(), op.param().height_out, op.param().width_out);
+  int32_t height_out, width_out;
+  if (op.getInputs().size() == 2)
+  {
+    auto size_ind = op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE);
+    auto size = _tensor_registry->getITensor(size_ind);
+    if (size->data_type() == ir::DataType::INT32)
+    {
+      auto size_buf = reinterpret_cast<const int32_t *>(size->buffer());
+      height_out = size_buf[0];
+      width_out = size_buf[1];
+    }
+    else
+    {
+      throw std::runtime_error("DynamicShapeInferer ResizeBilinear : Unsupported data type");
+    }
+  }
+  else
+  {
+    height_out = op.param().height_out;
+    width_out = op.param().width_out;
+  }
+  auto output_shape =
+      shape_inference::inferResizeBilinearShape(input->getShape(), height_out, width_out);
 
   // if shape is changed, change output shape and reallocate output tensor memory
   if (output_shape != output->getShape() || output->buffer() == nullptr)
   {
     // change on output shape
-    dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+    output->applyShape(output_shape);
   }
   assert(output->buffer() != nullptr);
 }
@@ -749,7 +928,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op)
   auto output_ind = op.getOutputs().at(0);
   auto output = _tensor_registry->getITensor(output_ind);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -768,7 +947,7 @@ void DynamicShapeInferer::visit(const ir::operation::Shape &op)
   ir::Shape output_shape;
   output_shape.append(input_shape.rank());
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -794,7 +973,7 @@ void DynamicShapeInferer::visit(const ir::operation::Slice &op)
 
   ir::Shape new_shape = shape_inference::inferSliceShape(input_shape, begins_buf, sizes_buf);
 
-  dynamicTensorManagerOf(output)->applyShape(output_index, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -831,7 +1010,7 @@ void DynamicShapeInferer::visit(const ir::operation::SpaceToBatchND &op)
   ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape(
       input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
 
-  dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -840,27 +1019,37 @@ void DynamicShapeInferer::visit(const ir::operation::Split &op)
   const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)};
   const auto &input = _tensor_registry->getITensor(input_idx);
 
-  if (!input->is_dynamic())
+  // Return if all tensors are not dynamic
+  bool has_dynamic = false;
+  for (const auto output_idx : op.getOutputs())
+  {
+    auto output = _tensor_registry->getITensor(output_idx);
+    has_dynamic |= output->is_dynamic();
+  }
+  if (!input->is_dynamic() && !has_dynamic)
   {
     return;
   }
 
   auto input_shape = input->getShape();
 
-  const auto axis = op.param().axis;
+  const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)};
+  const auto &axis = _tensor_registry->getITensor(axis_idx);
+
+  auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer());
   const auto num_splits = op.param().num_splits;
   const auto rank = input_shape.rank();
-  auto axis_resolved = axis < 0 ? axis + rank : axis;
+  axis_value = axis_value < 0 ? axis_value + rank : axis_value;
 
-  assert(0 <= axis_resolved && axis_resolved < rank);
+  assert(0 <= axis_value && axis_value < rank);
 
-  ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_resolved, num_splits);
+  ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_value, num_splits);
   for (int out_tensor_idx = 0; out_tensor_idx < num_splits; out_tensor_idx++)
   {
     auto output_ind = op.getOutputs().at(out_tensor_idx);
     auto output = _tensor_registry->getITensor(output_ind);
 
-    dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+    output->applyShape(new_shape);
     assert(output->buffer() != nullptr);
   }
 }
@@ -889,7 +1078,7 @@ void DynamicShapeInferer::visit(const ir::operation::Squeeze &op)
   auto output_ind = op.getOutputs().at(0);
   auto output = _tensor_registry->getITensor(output_ind);
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -930,7 +1119,7 @@ void DynamicShapeInferer::visit(const ir::operation::StridedSlice &op)
   ir::Shape output_shape =
       onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank);
 
-  dynamicTensorManagerOf(output)->applyShape(output_index, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -952,10 +1141,11 @@ void DynamicShapeInferer::visit(const ir::operation::Tile &op)
   auto multiplier_buffer = reinterpret_cast<const int32_t *>(multiplier->buffer());
   assert(multiplier_buffer);
 
-  auto output_shape = shape_inference::inferTileShape(input_shape, multiplier_buffer);
+  auto output_shape =
+      shape_inference::inferTileShape(input_shape, multiplier_buffer, multiplier->dimension(0));
 
   // set output shape and output buffer
-  dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
+  output->applyShape(output_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -967,17 +1157,48 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op)
 
   // from op, access the buffer of second input to read new shape
   auto input_ind = op.getInputs().at(ir::operation::Transpose::Input::INPUT);
-  auto input_tensor = _tensor_registry->getITensor(input_ind);
-  auto input_shape = input_tensor->getShape();
+  auto input = _tensor_registry->getITensor(input_ind);
+  auto input_shape = input->getShape();
 
-  if (!input_tensor->is_dynamic())
+  /*
+    Here, the state after compilation (static shape inference) could be one of the following:
+
+              input       perms             output     execution-time shape inf required
+              ------------------------------------     --------------------------------
+      case 1) static         const          static       X
+      case 2) static       non-const        dynamic      O
+      case 3) dynamic        const          dynamic      O
+      case 4) dynamic      non-const        dynamic      O
+
+    So, only when both input1 and ouput are static, we can skip dynamic shape inference.
+  */
+  if ((!input->is_dynamic()) && (!output->is_dynamic()))
     return;
 
-  const auto perm{op.param().perm};
-  // set output shape, based on input and params
-  ir::Shape new_shape = shape_inference::inferTransposeShape(input_shape, perm);
+  auto perm_ind = op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION);
+  auto perm = _tensor_registry->getITensor(perm_ind);
+
+  ir::Shape new_shape;
+  // TODO Change perm->dimension(0) == 0 to perm->num_elements() == 0
+  if (perm->dimension(0) == 0) // This condition means that perm is (n-1...0)
+  {
+    // Call by (n-1...0)
+    new_shape = shape_inference::inferTransposeShape(input_shape, nullptr, 0);
+  }
+  else
+  {
+    // Check rank
+    if (input->num_dimensions() != perm->getShape().num_elements())
+    {
+      throw std::runtime_error("DynamicShapeInferer failed, bad rank size: " +
+                               std::to_string(perm->getShape().num_elements()));
+    }
 
-  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+    // set output shape, based on input and params
+    const auto perm_buffer = reinterpret_cast<const int32_t *>(perm->buffer());
+    new_shape = shape_inference::inferTransposeShape(input_shape, perm_buffer, perm->dimension(0));
+  }
+  output->applyShape(new_shape);
   assert(output->buffer() != nullptr);
 }
 
@@ -1005,7 +1226,7 @@ void DynamicShapeInferer::visit(const ir::operation::Unpack &op)
     auto output_ind = op.getOutputs().at(out_tensor_idx);
     auto output = _tensor_registry->getITensor(output_ind);
 
-    dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
+    output->applyShape(new_shape);
 
     assert(output->buffer() != nullptr);
   }
diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc
index 7feb3ab68..21fdd9c05 100644
--- a/runtime/onert/core/src/exec/Execution.cc
+++ b/runtime/onert/core/src/exec/Execution.cc
@@ -34,14 +34,13 @@ Execution::Execution(const std::shared_ptr<ExecutorMap> &executors) : _executors
 
 void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape)
 {
-  // This should be called BEFORE setInput.
-  if (_io_desc.inputs.at(index.value()) != 0)
-    throw std::runtime_error("Error in calling order");
-
   // This will be used later to set input tensor dynamic
   // Note that 'compiled' model will not be updated with new_shape
   // but new_shape will change model input shape while 'running' the model
   _io_desc.dynamic_input_shapes[index] = new_shape;
+
+  VERBOSE(Execution) << "Model input shape will be changed at the start of execute()"
+                     << "(index: " << index.value() << ")" << std::endl;
 }
 
 // TODO Remove default parameter
diff --git a/runtime/onert/core/src/exec/ExecutionObservers.cc b/runtime/onert/core/src/exec/ExecutionObservers.cc
index 060f874de..5883d9a1c 100644
--- a/runtime/onert/core/src/exec/ExecutionObservers.cc
+++ b/runtime/onert/core/src/exec/ExecutionObservers.cc
@@ -22,6 +22,7 @@
 #include "exec/IExecutor.h"
 #include "misc/polymorphic_downcast.h"
 #include "ir/OpSequence.h"
+#include "util/EventWriter.h"
 
 namespace onert
 {
@@ -70,7 +71,7 @@ void ProfileObserver::handleEnd(IExecutor *exec, const ir::OpSequence *op_seq,
 };
 
 ChromeTracingObserver::ChromeTracingObserver(const std::string &filepath, const ir::Graph &graph)
-    : _ofs{filepath, std::ofstream::out}, _recorder{}, _collector{&_recorder}, _graph{graph}
+    : _base_filepath(filepath), _recorder{}, _collector{&_recorder}, _graph{graph}
 {
 }
 
@@ -78,7 +79,7 @@ ChromeTracingObserver::~ChromeTracingObserver()
 {
   try
   {
-    _recorder.writeToFile(_ofs);
+    EventWriter{_recorder}.writeToFiles(_base_filepath);
   }
   catch (const std::exception &e)
   {
diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h
index ac0076ed2..f8c2acca5 100644
--- a/runtime/onert/core/src/exec/ExecutionObservers.h
+++ b/runtime/onert/core/src/exec/ExecutionObservers.h
@@ -76,7 +76,7 @@ private:
   static std::string opSequenceTag(const ir::OpSequence *op_seq, const ir::Operations &operations);
 
 private:
-  std::ofstream _ofs;
+  const std::string &_base_filepath;
   EventRecorder _recorder;
   EventCollector _collector;
   const ir::Graph &_graph;
diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
index f835a9675..018a0bba0 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.cc
+++ b/runtime/onert/core/src/exec/ExecutorBase.cc
@@ -27,38 +27,32 @@ namespace exec
 {
 
 ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph,
-                           const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-                           const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-                           const compiler::TensorRegistries &tensor_regs,
-                           backend::TensorManagerSet &&tensor_mgrs)
+                           const std::vector<backend::ITensor *> &input_tensors,
+                           const std::vector<backend::ITensor *> &output_tensors,
+                           const compiler::TensorRegistries &tensor_regs)
     : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()},
-      _input_tensors{input_tensors}, _output_tensors{output_tensors},
-      _tensor_mgrs{std::move(tensor_mgrs)}, _mutex()
+      _input_tensors{input_tensors}, _output_tensors{output_tensors}, _mutex()
 {
   // TODO Fix the way of knowing whether it is primary or not
   bool primary_executor = !(_input_tensors.empty() && _output_tensors.empty());
   if (!primary_executor)
   {
     auto build_input_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) {
-      std::vector<std::shared_ptr<backend::ITensor>> list;
+      std::vector<backend::ITensor *> list;
       for (auto ind : ind_seq)
       {
-        std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind);
+        backend::ITensor *tensor = tensor_regs.getITensor(ind);
         assert(tensor != nullptr);
-        DynAllocInfo dyn_alloc_info{ind};
-        _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info);
         list.push_back(tensor);
       }
       return list;
     };
     auto build_output_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) {
-      std::vector<std::shared_ptr<backend::ITensor>> list;
+      std::vector<backend::ITensor *> list;
       for (auto ind : ind_seq)
       {
-        std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind);
+        backend::ITensor *tensor = tensor_regs.getITensor(ind);
         assert(tensor != nullptr);
-        DynAllocInfo dyn_alloc_info{ind};
-        _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info);
         list.push_back(tensor);
       }
       return list;
@@ -66,28 +60,9 @@ ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_gra
     _input_tensors = build_input_tensor_list(_graph.getInputs());
     _output_tensors = build_output_tensor_list(_graph.getOutputs());
   }
-  else
-  {
-    assert(input_tensors.size() == _graph.getInputs().size());
-    assert(output_tensors.size() == _graph.getOutputs().size());
-    for (uint32_t i = 0; i < input_tensors.size(); i++)
-    {
-      auto tensor = input_tensors[i];
-      auto ind = _graph.getInputs().at(i);
-      DynAllocInfo dyn_alloc_info{ind};
-      _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info);
-    }
-    for (uint32_t i = 0; i < output_tensors.size(); i++)
-    {
-      auto tensor = output_tensors[i];
-      auto ind = _graph.getOutputs().at(i);
-      DynAllocInfo dyn_alloc_info{ind};
-      _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info);
-    }
-  }
 }
 
-void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors,
+void ExecutorBase::execute(const std::vector<backend::ITensor *> &src_tensors,
                            const std::shared_ptr<IPermuteFunction> &pre_fn)
 {
   // For thread-safe, use mutex
@@ -108,22 +83,12 @@ void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>>
     // If src_tensor or input_tensor is nullptr, pre_fn does not copy the tensors
     if (src_tensor != nullptr && input_tensor != nullptr)
     {
-      auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[n]);
       const auto orig_input_shape = input_tensor->getShape();
       const auto changed_input_shape =
           convertShape(src_tensor->getShape(), src_tensor->layout(), input_tensor->layout());
       if (orig_input_shape != changed_input_shape)
       {
-        if (dyn_alloc_info == _input_to_dyn_alloc_info.end())
-        {
-          // The input_tensor is a dynamic tensor of backend that doesn't support dynamic tensor
-          throw std::runtime_error("Unknown dim is found at execution time for a backend that "
-                                   "does not support dynamic tensor");
-        }
-        else
-        {
-          input_tensor->set_dynamic();
-        }
+        input_tensor->set_dynamic();
       }
     }
   }
@@ -147,7 +112,7 @@ void ExecutorBase::execute(const IODescription &desc)
   for (uint32_t i = 0; i < _input_tensors.size(); ++i)
   {
     // TODO Remove dynamic_cast
-    auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_input_tensors[i]);
+    auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_input_tensors[i]);
     assert(tensor);
     auto input_shape = desc.dynamic_input_shapes.find(ir::IOIndex{i});
     if (input_shape != desc.dynamic_input_shapes.end())
@@ -155,6 +120,7 @@ void ExecutorBase::execute(const IODescription &desc)
       tensor->set_dynamic();
       tensor->setShape(input_shape->second);
     }
+    // TODO Check if (desc.inputs[i] == nullptr)
     // TODO Better design for ITensor? (we need const_cast as ITensor is writable)
     tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.inputs[i]->buffer)),
                       desc.inputs[i]->size);
@@ -166,12 +132,12 @@ void ExecutorBase::execute(const IODescription &desc)
   for (uint32_t i = 0; i < _output_tensors.size(); ++i)
   {
     // TODO Remove dynamic_cast
-    auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_output_tensors[i]);
+    auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_output_tensors[i]);
     assert(tensor);
     tensor->set_dynamic(); // It can't be resized but shape could change
-    // TODO Better design for ITensor? (we need const_cast as ITensor is writable)
-    tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.outputs[i]->buffer)),
-                      desc.outputs[i]->size);
+    if (desc.outputs[i] == nullptr)
+      throw std::runtime_error{"Output " + std::to_string(i) + "'s buffer is not set."};
+    tensor->setBuffer(static_cast<uint8_t *>(desc.outputs[i]->buffer), desc.outputs[i]->size);
   }
 
   executeImpl();
@@ -218,17 +184,8 @@ void ExecutorBase::handleDynamicInputTensor(ir::IOIndex io_ind, const IODescript
   auto shape_sig_found = desc.dynamic_input_shapes.find(io_ind);
   if (shape_sig_found != desc.dynamic_input_shapes.end())
   {
-    auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[io_ind.value()]);
-    if (dyn_alloc_info == _input_to_dyn_alloc_info.end())
-      throw std::runtime_error("Unknown dim is found at execution time for a backend that "
-                               "does not support dynamic tensor");
-
     auto changed_input_shape = shape_sig_found->second;
-    auto operand_ind = dyn_alloc_info->second.ind;
-
-    auto dyn_tensor_manager = _input_tensors[io_ind.value()]->dynamic_tensor_manager();
-    assert(dyn_tensor_manager);
-    dyn_tensor_manager->applyShape(operand_ind, changed_input_shape);
+    _input_tensors[io_ind.value()]->applyShape(changed_input_shape);
   }
 }
 
diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h
index a13be7dbf..8a6ec9174 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.h
+++ b/runtime/onert/core/src/exec/ExecutorBase.h
@@ -20,9 +20,7 @@
 #include <mutex>
 
 #include "IPermuteFunction.h"
-#include "Source.h"
 #include "exec/ExecutionObservers.h"
-#include "Sink.h"
 #include "ShapeConverter.h"
 #include "exec/IExecutor.h"
 #include "compiler/LoweredGraph.h"
@@ -51,10 +49,9 @@ public:
    * @param tensor_builders Tensor builders that are currently used
    */
   ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph,
-               const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-               const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-               const compiler::TensorRegistries &tensor_regs,
-               backend::TensorManagerSet &&tensor_mgrs);
+               const std::vector<backend::ITensor *> &input_tensors,
+               const std::vector<backend::ITensor *> &output_tensors,
+               const compiler::TensorRegistries &tensor_regs);
 
   virtual ~ExecutorBase() = default;
 
@@ -66,7 +63,7 @@ public:
    * @param src_tensor Tensor list that will be copied to input tensors of this
    * @param pre_fn The permutation function that copy from src_tensor to input tensors of this
    */
-  void execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors,
+  void execute(const std::vector<backend::ITensor *> &src_tensors,
                const std::shared_ptr<IPermuteFunction> &pre_fn);
 
   void execute(const IODescription &desc) final;
@@ -81,17 +78,9 @@ public:
 
   void addObserver(std::unique_ptr<IExecutionObserver> ref) { _subject.add(std::move(ref)); };
 
-  const std::vector<std::shared_ptr<backend::ITensor>> &getInputTensors() const
-  {
-    return _input_tensors;
-  }
-
-  const std::vector<std::shared_ptr<backend::ITensor>> &getOutputTensors() const
-  {
-    return _output_tensors;
-  }
+  const std::vector<backend::ITensor *> &getInputTensors() const { return _input_tensors; }
 
-  const DynAllocInfoMap &getInputsDynamicAllocInfo() const { return _input_to_dyn_alloc_info; }
+  const std::vector<backend::ITensor *> &getOutputTensors() const { return _output_tensors; }
 
 protected:
   /**
@@ -104,11 +93,8 @@ protected:
   std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks;
   std::unique_ptr<compiler::LoweredGraph> _lowered_graph;
   const ir::Graph &_graph;
-  std::vector<std::shared_ptr<backend::ITensor>> _input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> _output_tensors;
-  DynAllocInfoMap _input_to_dyn_alloc_info;
-  DynAllocInfoMap _output_to_dyn_alloc_info;
-  backend::TensorManagerSet _tensor_mgrs;
+  std::vector<backend::ITensor *> _input_tensors;
+  std::vector<backend::ITensor *> _output_tensors;
   std::mutex _mutex;
 
 private:
diff --git a/runtime/onert/core/src/exec/FunctionSequence.cc b/runtime/onert/core/src/exec/FunctionSequence.cc
index fb31f7582..8aefa5eeb 100644
--- a/runtime/onert/core/src/exec/FunctionSequence.cc
+++ b/runtime/onert/core/src/exec/FunctionSequence.cc
@@ -28,9 +28,11 @@ namespace exec
 
 void FunctionSequence::run()
 {
-  // TODO Find out when `_enable_dynamic_shape_inferer` is true but `_dynamic_tensor_ctx` is false
   if (_enable_dynamic_shape_inferer && _dynamic_tensor_ctx)
   {
+    // acl_cl and acl_neon backend don't support dynamic shape.
+    // _dynamic_tensor_ctx is always nullptr for acl_cl and acl_neon
+    // Thus, those two bakends cannot reach here.
     if (_dynamic_tensor_ctx->op_seq->size() != _functions.size())
       throw std::runtime_error("operation and functions should be mapped one by one");
 
@@ -61,11 +63,6 @@ void FunctionSequence::run()
   {
     for (const auto &function : _functions)
     {
-      auto *sub_func_seq = dynamic_cast<FunctionSequence *>(function.get());
-      if (sub_func_seq != nullptr)
-      {
-        sub_func_seq->enableDynamicShapeInferer(false);
-      }
       function->run();
     }
   }
diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h
index 6b4d15380..94bc2e436 100644
--- a/runtime/onert/core/src/exec/IPermuteFunction.h
+++ b/runtime/onert/core/src/exec/IPermuteFunction.h
@@ -50,13 +50,13 @@ private:
 public:
   virtual void run() override
   {
-    assert(_src_tensors.size() > 0);
+    // TODO Optimization : Make control does not reach here? when (_src_tensors.size() == 0)
     assert(_src_tensors.size() == _dst_tensors.size());
     auto src_it = _src_tensors.begin();
     auto dst_it = _dst_tensors.begin();
     while (src_it != _src_tensors.end())
     {
-      const auto src_tensor = *src_it;
+      auto src_tensor = *src_it;
       auto dst_tensor = *dst_it;
       if (src_tensor != dst_tensor)
       {
@@ -101,9 +101,8 @@ public:
   virtual void optimize() = 0;
 
 private:
-  template <class T>
-  void permute(const std::shared_ptr<backend::ITensor> &src, std::shared_ptr<backend::ITensor> &dst,
-               size_t rank)
+  // TODO make src const by proving const access()
+  template <class T> void permute(backend::ITensor *src, backend::ITensor *dst, size_t rank)
   {
     const auto permute_type = [&]() -> PermuteType {
       if (src->layout() == ir::Layout::NHWC && dst->layout() == ir::Layout::NCHW)
@@ -121,127 +120,65 @@ private:
     }();
     auto fn = [&](backend::ITensor &src_tensor) {
       dst->access([&](backend::ITensor &dst_tensor) {
-        auto src_buffer = src_tensor.buffer();
-        auto src_size = src_tensor.total_size();
-        auto dst_buffer = dst_tensor.buffer();
-        if (permute_type == PermuteType::COPY)
+        if (rank == 4 && permute_type != PermuteType::COPY)
         {
-          assert(src_tensor.layout() == dst_tensor.layout());
-          if (!src_tensor.has_padding() && !dst_tensor.has_padding())
+          switch (permute_type)
           {
-            assert(src_size <= dst_tensor.total_size());
-            memcpy(dst_buffer, src_buffer, src_size);
-            return;
-          }
-        }
-        switch (rank)
-        {
-          case 0:
-          case 1:
-          {
-            const int32_t copy_len = dst_tensor.dimension(0);
-
-            memcpy(dst_buffer, src_buffer, copy_len * sizeof(T));
-            break;
-          }
-          case 2:
-          {
-            const int32_t dim_0 = dst_tensor.dimension(0);
-            const int32_t copy_len = dst_tensor.dimension(1);
-
-            for (int32_t i = 0; i < dim_0; ++i)
+            case PermuteType::NHWC_TO_NCHW:
             {
-              ir::Coordinates coords{i, 0};
-              memcpy(dst_buffer + dst_tensor.calcOffset(coords),
-                     src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T));
+              ir::FeatureShape shape;
+              shape.N = dst_tensor.dimension(0);
+              shape.C = dst_tensor.dimension(1);
+              shape.H = dst_tensor.dimension(2);
+              shape.W = dst_tensor.dimension(3);
+              const feature::nhwc::Reader<T> from(&src_tensor);
+              feature::nchw::View<T> into(&dst_tensor);
+              feature::iterate(shape)
+                  << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+                       const auto value = from.at(batch, row, col, ch);
+                       into.at(batch, ch, row, col) = value;
+                     };
+              break;
             }
-            break;
-          }
-          case 3:
-          {
-            const int32_t dim_0 = dst_tensor.dimension(0);
-            const int32_t dim_1 = dst_tensor.dimension(1);
-            const int32_t copy_len = dst_tensor.dimension(2);
-
-            for (auto i = 0; i < dim_0; ++i)
+            case PermuteType::NCHW_TO_NHWC:
             {
-              for (auto j = 0; j < dim_1; ++j)
-              {
-                ir::Coordinates coords{i, j, 0};
-                memcpy(dst_buffer + dst_tensor.calcOffset(coords),
-                       src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T));
-              }
+              ir::FeatureShape shape;
+              shape.N = src_tensor.dimension(0);
+              shape.C = src_tensor.dimension(1);
+              shape.H = src_tensor.dimension(2);
+              shape.W = src_tensor.dimension(3);
+              const feature::nchw::Reader<T> from(&src_tensor);
+              feature::nhwc::View<T> into(&dst_tensor);
+              feature::iterate(shape)
+                  << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+                       const auto value = from.at(batch, ch, row, col);
+                       into.at(batch, row, col, ch) = value;
+                     };
+              break;
             }
-            break;
-          }
-          case 4:
-          {
-            switch (permute_type)
+            default:
             {
-              case PermuteType::NHWC_TO_NCHW:
-              {
-                ir::FeatureShape shape;
-                shape.N = dst_tensor.dimension(0);
-                shape.C = dst_tensor.dimension(1);
-                shape.H = dst_tensor.dimension(2);
-                shape.W = dst_tensor.dimension(3);
-                const feature::nhwc::Reader<T> from(&src_tensor);
-                feature::nchw::View<T> into(&dst_tensor);
-                feature::iterate(shape)
-                    << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                         const auto value = from.at(batch, row, col, ch);
-                         into.at(batch, ch, row, col) = value;
-                       };
-                break;
-              }
-              case PermuteType::NCHW_TO_NHWC:
-              {
-                ir::FeatureShape shape;
-                shape.N = src_tensor.dimension(0);
-                shape.C = src_tensor.dimension(1);
-                shape.H = src_tensor.dimension(2);
-                shape.W = src_tensor.dimension(3);
-                const feature::nchw::Reader<T> from(&src_tensor);
-                feature::nhwc::View<T> into(&dst_tensor);
-                feature::iterate(shape)
-                    << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                         const auto value = from.at(batch, ch, row, col);
-                         into.at(batch, row, col, ch) = value;
-                       };
-                break;
-              }
-              case PermuteType::COPY:
-              {
-                const int32_t dim_0 = dst_tensor.dimension(0);
-                const int32_t dim_1 = dst_tensor.dimension(1);
-                const int32_t dim_2 = dst_tensor.dimension(2);
-                const int32_t copy_len = dst_tensor.dimension(3);
-
-                for (auto i = 0; i < dim_0; ++i)
-                {
-                  for (auto j = 0; j < dim_1; ++j)
-                  {
-                    for (auto k = 0; k < dim_2; ++k)
-                    {
-                      ir::Coordinates coords{i, j, k, 0};
-                      memcpy(dst_buffer + dst_tensor.calcOffset(coords),
-                             src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T));
-                    }
-                  }
-                }
-                break;
-              }
-              default:
-              {
-                throw std::runtime_error("Unsupported Permutation");
-                break;
-              }
+              throw std::runtime_error("Unsupported Permutation");
+              break;
             }
-            break;
           }
-          default:
-            throw std::runtime_error("Unsupported rank in permutation");
-            break;
+        }
+        else if (!src_tensor.has_padding() && !dst_tensor.has_padding())
+        {
+          auto src_size = src_tensor.total_size();
+          assert(src_size <= dst_tensor.total_size());
+          memcpy(dst_tensor.buffer(), src_tensor.buffer(), src_size);
+        }
+        else
+        {
+          auto loop_shape = src_tensor.getShape();
+          const auto copy_axis = loop_shape.rank() - 1;
+          const auto copy_len = loop_shape.dim(copy_axis) * sizeof(T);
+          loop_shape.dim(copy_axis) = 1;
+          ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) {
+            memcpy(dst_tensor.buffer() + dst_tensor.calcOffset(coords),
+                   src_tensor.buffer() + src_tensor.calcOffset(coords), copy_len);
+          });
         }
       });
     };
@@ -275,8 +212,8 @@ private:
   }
 
 protected:
-  std::vector<std::shared_ptr<backend::ITensor>> _src_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> _dst_tensors;
+  std::vector<backend::ITensor *> _src_tensors;
+  std::vector<backend::ITensor *> _dst_tensors;
   // TODO Remove this member if it is possible
   std::vector<size_t> _ranks;
 };
diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc
index 69dfe9b9b..6e6ca110f 100644
--- a/runtime/onert/core/src/exec/LinearExecutor.cc
+++ b/runtime/onert/core/src/exec/LinearExecutor.cc
@@ -51,8 +51,10 @@ void LinearExecutor::executeImpl()
     _subject.notifyJobBegin(this, op_seq, backend);
 
     auto &fn_seq = code.fn_seq;
-    bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput();
 
+    fn_seq->initRunning();
+
+    bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput();
     fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor);
     fn_seq->run();
 
diff --git a/runtime/onert/core/src/exec/LinearExecutor.h b/runtime/onert/core/src/exec/LinearExecutor.h
index c224d3f4f..22d00ec30 100644
--- a/runtime/onert/core/src/exec/LinearExecutor.h
+++ b/runtime/onert/core/src/exec/LinearExecutor.h
@@ -47,13 +47,11 @@ public:
    * @param code_map OpSequence and its code map
    */
   LinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                 const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-                 const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-                 const compiler::TensorRegistries &tensor_regs,
-                 backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map,
+                 const std::vector<backend::ITensor *> &input_tensors,
+                 const std::vector<backend::ITensor *> &output_tensors,
+                 const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map,
                  const std::vector<ir::OpSequenceIndex> &order)
-      : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
-                     std::move(tensor_mgrs)}
+      : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs}
   {
     for (auto index : order)
     {
diff --git a/runtime/onert/core/src/exec/ParallelExecutor.cc b/runtime/onert/core/src/exec/ParallelExecutor.cc
index ab234aacd..676bdb5fa 100644
--- a/runtime/onert/core/src/exec/ParallelExecutor.cc
+++ b/runtime/onert/core/src/exec/ParallelExecutor.cc
@@ -59,14 +59,13 @@ void ParallelExecutor::notify(uint32_t finished_job_id)
   _cv_jobs.notify_all();
 }
 
-ParallelExecutor::ParallelExecutor(
-    std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-    const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-    const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-    const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs,
-    compiler::CodeMap &&code_map)
-    : DataflowExecutor{std::move(lowered_graph), input_tensors,      output_tensors, tensor_regs,
-                       std::move(tensor_mgrs),   std::move(code_map)}
+ParallelExecutor::ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                   const std::vector<backend::ITensor *> &input_tensors,
+                                   const std::vector<backend::ITensor *> &output_tensors,
+                                   const compiler::TensorRegistries &tensor_regs,
+                                   compiler::CodeMap &&code_map)
+    : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
+                       std::move(code_map)}
 {
   VERBOSE(ParallelExecutor) << "Constructing Parallel Executor" << std::endl;
 }
@@ -133,6 +132,8 @@ void ParallelExecutor::executeImpl()
       notify(job_index);
     };
 
+    job->fn_seq()->initRunning();
+
     // dynamic tensor setting
     bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists;
     job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor);
diff --git a/runtime/onert/core/src/exec/ParallelExecutor.h b/runtime/onert/core/src/exec/ParallelExecutor.h
index 929edfce9..111c20c0c 100644
--- a/runtime/onert/core/src/exec/ParallelExecutor.h
+++ b/runtime/onert/core/src/exec/ParallelExecutor.h
@@ -51,10 +51,9 @@ public:
    * @param code_map OpSequence and its code map
    */
   ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                   const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors,
-                   const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors,
-                   const compiler::TensorRegistries &tensor_regs,
-                   backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map);
+                   const std::vector<backend::ITensor *> &input_tensors,
+                   const std::vector<backend::ITensor *> &output_tensors,
+                   const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map);
 
   void executeImpl() override;
 
diff --git a/runtime/onert/core/src/exec/Sink.h b/runtime/onert/core/src/exec/Sink.h
deleted file mode 100644
index 6a99efe60..000000000
--- a/runtime/onert/core/src/exec/Sink.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_EXEC_SINK_H__
-#define __ONERT_EXEC_SINK_H__
-
-#include "feature/nchw/Reader.h"
-#include "feature/nchw/View.h"
-#include "feature/nhwc/Reader.h"
-#include "feature/nhwc/View.h"
-
-#include <cassert>
-#include <memory>
-#include "util/Utils.h"
-#include <misc/feature/IndexIterator.h>
-
-namespace onert
-{
-namespace exec
-{
-struct ISink
-{
-  virtual ~ISink() = default;
-
-  virtual void pull(::onert::backend::ITensor &tensor) const = 0;
-};
-
-// Create second lever inheritance: the first lever is used as a reference type in use-case places
-template <typename T> class ITemplSink : public ISink
-{
-public:
-  ITemplSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape,
-             const bool copy, ir::Layout io_layout)
-      : _output_buffer{reinterpret_cast<T *>(output_buffer)}, _output_size{output_size},
-        _shape{shape}, _copy{copy}, _io_layout{io_layout}
-  {
-  }
-
-protected:
-  void pullUnif(onert::backend::ITensor &tensor) const
-  {
-    assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) ||
-            (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) ||
-           _copy);
-    auto input_buffer = tensor.buffer();
-    auto rank = _shape.rank();
-
-    if (!tensor.has_padding() && rank < 4 + _copy)
-    {
-      memcpy(_output_buffer, input_buffer, _output_size);
-      return;
-    }
-
-    switch (rank)
-    {
-      case 0:
-      case 1:
-      {
-        memcpy(_output_buffer, input_buffer, _output_size);
-        break;
-      }
-      case 2:
-      {
-        const int32_t copy_len = _shape.dim(1);
-
-        for (auto i = 0; i < _shape.dim(0); ++i)
-        {
-          ir::Coordinates coords{i, 0};
-          memcpy(_output_buffer + i * copy_len, input_buffer + tensor.calcOffset(coords),
-                 copy_len * sizeof(T));
-        }
-        break;
-      }
-      case 3:
-      {
-        const int32_t dim1 = _shape.dim(1);
-        const int32_t dim2 = _shape.dim(2);
-
-        for (auto i = 0; i < _shape.dim(0); ++i)
-        {
-          for (auto j = 0; j < _shape.dim(1); ++j)
-          {
-            ir::Coordinates coords{i, j, 0};
-            memcpy(_output_buffer + i * dim1 * dim2 + j * dim2,
-                   input_buffer + tensor.calcOffset(coords), dim2 * sizeof(T));
-          }
-        }
-        break;
-      }
-      case 4:
-      {
-        if (_copy)
-        {
-          const int32_t dim1 = _shape.dim(1);
-          const int32_t dim2 = _shape.dim(2);
-          const int32_t dim3 = _shape.dim(3);
-
-          for (auto i = 0; i < _shape.dim(0); ++i)
-          {
-            for (auto j = 0; j < _shape.dim(1); ++j)
-            {
-              for (auto k = 0; k < _shape.dim(2); ++k)
-              {
-                ir::Coordinates coords{i, j, k, 0};
-                memcpy(_output_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3,
-                       input_buffer + tensor.calcOffset(coords), dim3 * sizeof(T));
-              }
-            }
-          }
-        }
-        else
-        {
-          const auto shape = _shape.asFeature(_io_layout);
-
-          if (_io_layout == ir::Layout::NHWC)
-          {
-            const exec::feature::nchw::Reader<T> from(&tensor);
-            exec::feature::nhwc::View<T> into(shape, _output_buffer, _output_size);
-            feature::iterate(shape)
-                << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(batch, ch, row, col);
-                     into.at(batch, row, col, ch) = value;
-                   };
-          }
-          else if (_io_layout == ir::Layout::NCHW)
-          {
-            const exec::feature::nhwc::Reader<T> from(&tensor);
-            exec::feature::nchw::View<T> into(shape, _output_buffer, _output_size);
-            feature::iterate(shape)
-                << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(batch, row, col, ch);
-                     into.at(batch, ch, row, col) = value;
-                   };
-          }
-          else
-          {
-            throw std::runtime_error("Wrong Layout");
-          }
-        }
-        break;
-      }
-      default:
-        throw std::runtime_error("NYI: rank > 4");
-        break;
-    }
-  }
-
-private:
-  T *_output_buffer;
-  const size_t _output_size;
-  const ir::Shape _shape;
-  const bool _copy;
-  const ir::Layout _io_layout;
-};
-
-template <typename T> class PermutateSink final : public ITemplSink<T>
-{
-public:
-  PermutateSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape,
-                ir::Layout io_layout)
-      : ITemplSink<T>(output_buffer, output_size, shape, false, io_layout)
-  {
-  }
-
-public:
-  void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); }
-};
-
-// Only supports NHWC format front-end(NNAPI) now
-template <typename T> class CopySink final : public ITemplSink<T>
-{
-public:
-  CopySink(void *output_buffer, const size_t &output_size, const ir::Shape &shape,
-           ir::Layout io_layout = ir::Layout::UNKNOWN)
-      : ITemplSink<T>(output_buffer, output_size, shape, true, io_layout)
-  {
-  }
-
-public:
-  void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); }
-};
-
-} // namespace exec
-} // namespace onert
-
-#endif // __ONERT_EXEC_SINK_H__
diff --git a/runtime/onert/core/src/exec/Source.h b/runtime/onert/core/src/exec/Source.h
deleted file mode 100644
index fb2be4dd8..000000000
--- a/runtime/onert/core/src/exec/Source.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_EXEC_SOURCE_H__
-#define __ONERT_EXEC_SOURCE_H__
-
-#include "feature/IndexIterator.h"
-#include "feature/nchw/Reader.h"
-#include "feature/nchw/View.h"
-#include "feature/nhwc/Reader.h"
-#include "feature/nhwc/View.h"
-
-#include <cassert>
-#include <memory>
-#include "util/Utils.h"
-#include <ir/Layout.h>
-#include "ir/Shape.h"
-
-namespace onert
-{
-namespace exec
-{
-
-struct ISource
-{
-  virtual ~ISource() = default;
-
-  virtual void push(::onert::backend::ITensor &tensor) const = 0;
-};
-
-// Create second lever inheritance: the first lever is used as a reference type in use-case places
-template <typename T> class ITemplSource : public ISource
-{
-public:
-  ITemplSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape,
-               const bool copy, ir::Layout io_layout)
-      : _input_buffer{reinterpret_cast<const T *>(input_buffer)}, _input_size{input_size},
-        _shape{shape}, _copy(copy), _io_layout{io_layout}
-  {
-  }
-
-  virtual void push(::onert::backend::ITensor &tensor) const = 0;
-
-protected:
-  void pushUnif(onert::backend::ITensor &tensor) const
-  {
-    assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) ||
-            (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) ||
-           _copy);
-    auto output_buffer = tensor.buffer();
-    auto rank = _shape.rank();
-
-    if (!tensor.has_padding() && rank < 4 + _copy)
-    {
-      memcpy(output_buffer, _input_buffer, _input_size);
-      return;
-    }
-
-    switch (rank)
-    {
-      case 0:
-      case 1:
-      {
-        memcpy(output_buffer, _input_buffer, _input_size);
-        break;
-      }
-      case 2:
-      {
-        const int32_t copy_len = _shape.dim(1);
-
-        for (auto i = 0; i < _shape.dim(0); ++i)
-        {
-          ir::Coordinates coords{i, 0};
-          memcpy(output_buffer + tensor.calcOffset(coords), _input_buffer + i * copy_len,
-                 copy_len * sizeof(T));
-        }
-        break;
-      }
-      case 3:
-      {
-        const int32_t dim1 = _shape.dim(1);
-        const int32_t dim2 = _shape.dim(2);
-
-        for (auto i = 0; i < _shape.dim(0); ++i)
-        {
-          for (auto j = 0; j < _shape.dim(1); ++j)
-          {
-            ir::Coordinates coords{i, j, 0};
-            memcpy(output_buffer + tensor.calcOffset(coords),
-                   _input_buffer + i * dim1 * dim2 + j * dim2, dim2 * sizeof(T));
-          }
-        }
-        break;
-      }
-      case 4:
-      {
-        if (_copy)
-        {
-          const int32_t dim1 = _shape.dim(1);
-          const int32_t dim2 = _shape.dim(2);
-          const int32_t dim3 = _shape.dim(3);
-          for (auto i = 0; i < _shape.dim(0); ++i)
-          {
-            for (auto j = 0; j < _shape.dim(1); ++j)
-            {
-              for (auto k = 0; k < _shape.dim(2); ++k)
-              {
-                ir::Coordinates coords{i, j, k, 0};
-                memcpy(output_buffer + tensor.calcOffset(coords),
-                       _input_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3,
-                       dim3 * sizeof(T));
-              }
-            }
-          }
-        }
-        else
-        {
-          const auto shape = _shape.asFeature(_io_layout);
-
-          if (_io_layout == ir::Layout::NCHW)
-          {
-            const exec::feature::nchw::Reader<T> from(shape, _input_buffer, _input_size);
-            exec::feature::nhwc::View<T> into(&tensor);
-            feature::iterate(shape)
-                << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(batch, ch, row, col);
-                     into.at(batch, row, col, ch) = value;
-                   };
-          }
-          else if (_io_layout == ir::Layout::NHWC)
-          {
-            const exec::feature::nhwc::Reader<T> from(shape, _input_buffer, _input_size);
-            exec::feature::nchw::View<T> into(&tensor);
-            feature::iterate(shape)
-                << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(batch, row, col, ch);
-                     into.at(batch, ch, row, col) = value;
-                   };
-          }
-          else
-          {
-            throw std::runtime_error("Wrong Layout");
-          }
-        }
-
-        break;
-      }
-      default:
-        throw std::runtime_error("NYI: rank > 4");
-        break;
-    }
-  }
-
-private:
-  const T *_input_buffer;
-  const size_t _input_size;
-  const ir::Shape _shape;
-  const bool _copy;
-  const ir::Layout _io_layout;
-};
-
-template <typename T> class PermutateSource final : public ITemplSource<T>
-{
-public:
-  PermutateSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape,
-                  ir::Layout io_layout)
-      : ITemplSource<T>(input_buffer, input_size, shape, false, io_layout)
-  {
-  }
-
-public:
-  void push(onert::backend::ITensor &tensor) const override
-  {
-    // do NHWC_TO_NCHW or NCHW_TO_NHWC permutation
-    ITemplSource<T>::pushUnif(tensor);
-  }
-};
-
-template <typename T> class CopySource final : public ITemplSource<T>
-{
-public:
-  CopySource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape,
-             ir::Layout io_layout = ir::Layout::UNKNOWN)
-      : ITemplSource<T>(input_buffer, input_size, shape, true, io_layout)
-  {
-  }
-
-public:
-  void push(onert::backend::ITensor &tensor) const override { ITemplSource<T>::pushUnif(tensor); }
-};
-
-} // namespace exec
-} // namespace onert
-
-#endif // __ONERT_EXEC_SOURCE_H__
diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h
index 008a4b9d4..8b72d537d 100644
--- a/runtime/onert/core/src/interp/Tensor.h
+++ b/runtime/onert/core/src/interp/Tensor.h
@@ -171,7 +171,6 @@ public:
   int32_t data_offset() const override { return _info.typeInfo().offset(); }
   const ir::OperandInfo &tensorInfo() const override { return _info; }
   uint64_t num_elements() const override { return _info.shape().num_elements(); };
-  backend::IDynamicTensorManager *dynamic_tensor_manager() override { return nullptr; }
 
 private:
   const ir::OperandInfo _info;
diff --git a/runtime/onert/core/src/ir/Graph.cc b/runtime/onert/core/src/ir/Graph.cc
index fe8b1b443..605562ebc 100644
--- a/runtime/onert/core/src/ir/Graph.cc
+++ b/runtime/onert/core/src/ir/Graph.cc
@@ -103,7 +103,7 @@ void Graph::initializeUseDef()
 {
   operations().iterate([&](const OperationIndex &index, const Operation &node) -> void {
     auto outputs = node.getOutputs();
-    for (auto output : outputs)
+    for (auto output : outputs | ir::Remove::UNDEFINED)
     {
       operands().at(output).setDef(index);
     }
diff --git a/runtime/onert/core/src/ir/GraphIterator.cc b/runtime/onert/core/src/ir/GraphIterator.cc
index 4bea1a55d..ac67771c4 100644
--- a/runtime/onert/core/src/ir/GraphIterator.cc
+++ b/runtime/onert/core/src/ir/GraphIterator.cc
@@ -53,7 +53,7 @@ void PostDfsIterator<is_const>::iterate(GraphRef graph, const IterFn &fn) const
       return;
     visited[index] = true;
 
-    for (const auto output : node.getOutputs() | Remove::DUPLICATED)
+    for (const auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
     {
       const auto &operand = graph.operands().at(output);
       for (const auto &use : operand.getUses())
@@ -86,7 +86,7 @@ void PostDfsIterator<is_const>::iterateOpSeqs(LoweredGraphRef lowered_graph,
       return;
     visited[index] = true;
 
-    for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED)
+    for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
     {
       const auto &operand = lowered_graph.graph().operands().at(output);
       for (const auto &use : operand.getUses())
diff --git a/runtime/onert/core/src/ir/Operation.cc b/runtime/onert/core/src/ir/Operation.cc
index 04be8c0d9..4af878541 100644
--- a/runtime/onert/core/src/ir/Operation.cc
+++ b/runtime/onert/core/src/ir/Operation.cc
@@ -24,22 +24,33 @@ namespace ir
 {
 
 Operation::Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs,
-                     const OperandIndexSequence &outputs)
-    : _input_constr{input_constr}, _inputs{inputs}, _outputs{outputs}
+                     const OperandIndexSequence &outputs, OperandConstraint output_constr)
+    : _input_constr{input_constr}, _output_constr{output_constr}
 {
+  setInputs(inputs);
+  setOutputs(outputs);
 }
 
-Operation::Operation(OperandConstraint input_constr) : _input_constr{input_constr} {}
+Operation::Operation(OperandConstraint input_constr, OperandConstraint output_constr)
+    : _input_constr{input_constr}, _output_constr{output_constr}
+{
+}
 
 Operation::~Operation() = default;
 
 void Operation::setInputs(const OperandIndexSequence &indexes)
 {
-  assert(_input_constr.check(indexes.size()));
+  if (!_input_constr.check(indexes.size()))
+    throw std::runtime_error{"Invalid number of input tensors for this operation."};
   _inputs = indexes;
 }
 
-void Operation::setOutputs(const OperandIndexSequence &indexes) { _outputs = indexes; }
+void Operation::setOutputs(const OperandIndexSequence &indexes)
+{
+  if (!_output_constr.check(indexes.size()))
+    throw std::runtime_error{"Invalid number of output tensors for this operation."};
+  _outputs = indexes;
+}
 
 void Operation::replaceInputs(const OperandIndex &from, const OperandIndex &to)
 {
diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc
index 48361f464..eecfe81cc 100644
--- a/runtime/onert/core/src/ir/OperationDumper.cc
+++ b/runtime/onert/core/src/ir/OperationDumper.cc
@@ -40,7 +40,7 @@ void dumpUnaryInputOp(const Operation &node, const std::string &adding_input = "
 void dumpBinaryInputOp(const Operation &node, const std::string &adding_input = "")
 {
   VERBOSE(LIR) << "* " << node.name() << std::endl;
-  VERBOSE(LIR) << "  - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(0)
+  VERBOSE(LIR) << "  - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(1)
                << ") " << adding_input << std::endl;
   VERBOSE(LIR) << "  - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl;
 }
@@ -72,7 +72,7 @@ OperationDumper::OperationDumper(const std::string &start_msg)
   VERBOSE(LIR) << start_msg << std::endl;
 }
 
-void OperationDumper::visit(const ArgMax &node) { dumpUnaryInputOp(node); }
+void OperationDumper::visit(const ArgMax &node) { dumpBinaryInputOp(node); }
 
 void OperationDumper::visit(const BatchToSpaceND &node)
 {
@@ -82,6 +82,20 @@ void OperationDumper::visit(const BatchToSpaceND &node)
   dumpUnaryInputOp(node, block_size);
 }
 
+void OperationDumper::visit(const BCQFullyConnected &node)
+{
+  VERBOSE(LIR) << "* " << node.name() << std::endl;
+  VERBOSE(LIR) << "  - Inputs : IFM(" << node.getInputs().at(BCQFullyConnected::Input::INPUT)
+               << ") WeightsBinary("
+               << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_BINARY)
+               << ") WeightsScales("
+               << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_SCALES)
+               << ") WeightsClusters("
+               << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_CLUSTERS) << ") Bias("
+               << node.getInputs().at(BCQFullyConnected::Input::BIAS) << ")" << std::endl;
+  VERBOSE(LIR) << "  - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl;
+}
+
 void OperationDumper::visit(const BinaryArithmetic &node) { dumpBinaryInputOp(node); }
 
 void OperationDumper::visit(const operation::BroadcastTo &node) { dumpBinaryInputOp(node); }
@@ -185,6 +199,7 @@ void OperationDumper::visit(const LocalResponseNormalization &node) { dumpUnaryI
 
 void OperationDumper::visit(const LSTM &node)
 {
+  VERBOSE(LIR) << "* " << node.name() << std::endl;
   VERBOSE(LIR)
       << "  - Inputs : Input(" << node.getInputs().at(LSTM::Input::INPUT)
       << ") Input To Input Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_INPUT_WEIGHTS)
@@ -209,12 +224,24 @@ void OperationDumper::visit(const LSTM &node)
       << node.getInputs().at(LSTM::Input::PROJECTION_WEIGHTS) << ") Projection Bias("
       << node.getInputs().at(LSTM::Input::PROJECTION_BIAS) << ") Output State In("
       << node.getInputs().at(LSTM::Input::OUTPUT_STATE_IN) << ") Cell State In("
-      << node.getInputs().at(LSTM::Input::CELL_STATE_IN) << ")" << std::endl;
+      << node.getInputs().at(LSTM::Input::CELL_STATE_IN);
+  if (node.getInputs().size() == 24)
+  {
+    VERBOSE(LIR) << ") Input Layer Normalization Weights("
+                 << node.getInputs().at(LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS)
+                 << ") Forget Layer Normalization Weights("
+                 << node.getInputs().at(LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS)
+                 << ") Cell Layer Normalization Weights("
+                 << node.getInputs().at(LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS)
+                 << ") Ouput Layer Normalization Weights("
+                 << node.getInputs().at(LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS);
+  }
+  VERBOSE(LIR) << ")" << std::endl;
   VERBOSE(LIR) << "  - Output : Scratch Buffer("
                << node.getOutputs().at(LSTM::Output::SCRATCH_BUFFER) << ") Output State Out("
-               << node.getInputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out("
-               << node.getInputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output("
-               << node.getInputs().at(LSTM::Output::OUTPUT) << ")" << std::endl;
+               << node.getOutputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out("
+               << node.getOutputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output("
+               << node.getOutputs().at(LSTM::Output::OUTPUT) << ")" << std::endl;
 }
 
 void OperationDumper::visit(const Pack &node) { dumpPackingOp(node); }
@@ -279,7 +306,37 @@ void OperationDumper::visit(const Reshape &node)
   dumpUnaryInputOp(node, shape);
 }
 
-void OperationDumper::visit(const ResizeBilinear &node) { dumpUnaryInputOp(node); }
+void OperationDumper::visit(const ResizeBilinear &node)
+{
+  if (node.getInputs().size() == 1)
+  {
+    dumpUnaryInputOp(node);
+  }
+  else if (node.getInputs().size() == 2)
+  {
+    dumpBinaryInputOp(node);
+  }
+  else
+  {
+    VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl;
+  }
+}
+
+void OperationDumper::visit(const ResizeNearestNeighbor &node)
+{
+  if (node.getInputs().size() == 1)
+  {
+    dumpUnaryInputOp(node);
+  }
+  else if (node.getInputs().size() == 2)
+  {
+    dumpBinaryInputOp(node);
+  }
+  else
+  {
+    VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl;
+  }
+}
 
 void OperationDumper::visit(const Reverse &node)
 {
@@ -336,7 +393,7 @@ void OperationDumper::visit(const SpaceToBatchND &node)
 
 void OperationDumper::visit(const SpaceToDepth &node) { dumpUnaryInputOp(node); }
 
-void OperationDumper::visit(const Split &node) { dumpUnaryInputOp(node); }
+void OperationDumper::visit(const Split &node) { dumpBinaryInputOp(node); }
 
 void OperationDumper::visit(const SquaredDifference &node) { dumpBinaryInputOp(node); }
 
@@ -384,7 +441,7 @@ void OperationDumper::visit(const TransposeConv &node)
   VERBOSE(LIR) << "  - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl;
 }
 
-void OperationDumper::visit(const Transpose &node) { dumpUnaryInputOp(node); }
+void OperationDumper::visit(const Transpose &node) { dumpBinaryInputOp(node); }
 
 void OperationDumper::visit(const Unpack &node)
 {
diff --git a/runtime/onert/core/src/ir/OperationDumper.h b/runtime/onert/core/src/ir/OperationDumper.h
index e8ab3b3cd..91642ab13 100644
--- a/runtime/onert/core/src/ir/OperationDumper.h
+++ b/runtime/onert/core/src/ir/OperationDumper.h
@@ -33,6 +33,7 @@ public:
 public:
   void visit(const operation::ArgMax &) override;
   void visit(const operation::BatchToSpaceND &node) override;
+  void visit(const operation::BCQFullyConnected &node) override;
   void visit(const operation::BinaryArithmetic &node) override;
   void visit(const operation::BroadcastTo &) override;
   void visit(const operation::Comparison &) override;
@@ -65,6 +66,7 @@ public:
   void visit(const operation::Reduce &) override;
   void visit(const operation::Reshape &node) override;
   void visit(const operation::ResizeBilinear &) override;
+  void visit(const operation::ResizeNearestNeighbor &) override;
   void visit(const operation::Reverse &) override;
   void visit(const operation::RNN &) override;
   void visit(const operation::Select &node) override;
diff --git a/runtime/onert/core/src/ir/operation/ArgMax.cc b/runtime/onert/core/src/ir/operation/ArgMax.cc
index 1275ae43a..f3bd8fd73 100644
--- a/runtime/onert/core/src/ir/operation/ArgMax.cc
+++ b/runtime/onert/core/src/ir/operation/ArgMax.cc
@@ -31,7 +31,7 @@ void ArgMax::accept(OperationVisitor &v) const { v.visit(*this); }
 
 ArgMax::ArgMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
                const Param &param)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
index 9ef2b125f..34be79dd2 100644
--- a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
+++ b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
@@ -31,7 +31,7 @@ void BatchToSpaceND::accept(OperationVisitor &v) const { v.visit(*this); }
 
 BatchToSpaceND::BatchToSpaceND(const OperandIndexSequence &inputs,
                                const OperandIndexSequence &outputs)
-    : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+    : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
index 7dfcd4a98..6a0be7eb8 100644
--- a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
+++ b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
@@ -32,7 +32,9 @@ void ElementwiseUnary::accept(OperationVisitor &v) const { v.visit(*this); }
 
 ElementwiseUnary::ElementwiseUnary(const OperandIndexSequence &inputs,
                                    const OperandIndexSequence &outputs, const Param &param)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createExact(1u), inputs, outputs,
+                OperandConstraint::createExact(1u)},
+      _param{param}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/Fill.cc b/runtime/onert/core/src/ir/operation/Fill.cc
index c44f45aab..b8b97d1c0 100644
--- a/runtime/onert/core/src/ir/operation/Fill.cc
+++ b/runtime/onert/core/src/ir/operation/Fill.cc
@@ -30,7 +30,7 @@ namespace operation
 void Fill::accept(OperationVisitor &v) const { v.visit(*this); }
 
 Fill::Fill(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+    : Operation{OperandConstraint::createExact(2u), inputs, outputs}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/FullyConnected.cc b/runtime/onert/core/src/ir/operation/FullyConnected.cc
index 118ae554a..9837a3137 100644
--- a/runtime/onert/core/src/ir/operation/FullyConnected.cc
+++ b/runtime/onert/core/src/ir/operation/FullyConnected.cc
@@ -31,7 +31,7 @@ void FullyConnected::accept(OperationVisitor &v) const { v.visit(*this); }
 
 FullyConnected::FullyConnected(const OperandIndexSequence &inputs,
                                const OperandIndexSequence &outputs, const Param &param)
-    : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}, _param{param}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/LSTM.cc b/runtime/onert/core/src/ir/operation/LSTM.cc
index 30a865326..5cd7c793a 100644
--- a/runtime/onert/core/src/ir/operation/LSTM.cc
+++ b/runtime/onert/core/src/ir/operation/LSTM.cc
@@ -31,10 +31,18 @@ void LSTM::accept(OperationVisitor &v) const { v.visit(*this); }
 
 LSTM::LSTM(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
            const Param &param)
-    : Operation{OperandConstraint::createExact(23u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createInRange(20u, 24u), inputs, outputs}, _param{param}
 {
 }
 
+std::string LSTM::name() const
+{
+  if (getOutputs().at(Output::SCRATCH_BUFFER).undefined())
+    return std::string{"UnidirectionalSequenceLSTM"};
+  else
+    return Operation::name();
+}
+
 } // namespace operation
 } // namespace ir
 } // namespace onert
diff --git a/runtime/onert/core/src/ir/operation/Pack.cc b/runtime/onert/core/src/ir/operation/Pack.cc
index f0908a2c6..784d4162a 100644
--- a/runtime/onert/core/src/ir/operation/Pack.cc
+++ b/runtime/onert/core/src/ir/operation/Pack.cc
@@ -25,7 +25,7 @@ namespace operation
 void Pack::accept(OperationVisitor &v) const { v.visit(*this); }
 Pack::Pack(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
            const Param &param)
-    : Operation{OperandConstraint::createAtLeast(3u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
 {
 }
 } // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
index d0d89f45f..71925bb44 100644
--- a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
+++ b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
@@ -31,7 +31,7 @@ void ResizeBilinear::accept(OperationVisitor &v) const { v.visit(*this); }
 
 ResizeBilinear::ResizeBilinear(const OperandIndexSequence &inputs,
                                const OperandIndexSequence &outputs, const Param &param)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
index 9f17af97c..98d0b5f26 100644
--- a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
+++ b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
@@ -32,7 +32,7 @@ void ResizeNearestNeighbor::accept(OperationVisitor &v) const { v.visit(*this);
 ResizeNearestNeighbor::ResizeNearestNeighbor(const OperandIndexSequence &inputs,
                                              const OperandIndexSequence &outputs,
                                              const Param &param)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/operation/Split.cc b/runtime/onert/core/src/ir/operation/Split.cc
index 244884e41..b538e9206 100644
--- a/runtime/onert/core/src/ir/operation/Split.cc
+++ b/runtime/onert/core/src/ir/operation/Split.cc
@@ -25,7 +25,7 @@ namespace operation
 void Split::accept(OperationVisitor &v) const { v.visit(*this); }
 Split::Split(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
              const Param &param)
-    : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+    : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
 {
 }
 } // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/Transpose.cc b/runtime/onert/core/src/ir/operation/Transpose.cc
index 3a663fbce..997f98ab0 100644
--- a/runtime/onert/core/src/ir/operation/Transpose.cc
+++ b/runtime/onert/core/src/ir/operation/Transpose.cc
@@ -29,9 +29,8 @@ namespace operation
 
 void Transpose::accept(OperationVisitor &v) const { v.visit(*this); }
 
-Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
-                     const Param &param)
-    : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
+    : Operation{OperandConstraint::createExact(2u), inputs, outputs}
 {
 }
 
diff --git a/runtime/onert/core/src/ir/verifier/Verifier.cc b/runtime/onert/core/src/ir/verifier/Verifier.cc
index 09cbdcf2f..489845971 100644
--- a/runtime/onert/core/src/ir/verifier/Verifier.cc
+++ b/runtime/onert/core/src/ir/verifier/Verifier.cc
@@ -51,7 +51,7 @@ bool DAGChecker::verify(const Graph &graph) const noexcept
     visited[index] = true;
     on_stack[index] = true;
 
-    for (auto output : node.getOutputs() | Remove::DUPLICATED)
+    for (auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
     {
       const auto &operand = graph.operands().at(output);
       for (const auto &use : operand.getUses())
@@ -99,7 +99,7 @@ bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept
         errors += 1;
       }
     }
-    for (auto operand_index : node.getOutputs())
+    for (auto operand_index : node.getOutputs() | ir::Remove::UNDEFINED)
     {
       try
       {
diff --git a/runtime/onert/core/src/util/EventCollectorGlobal.cc b/runtime/onert/core/src/util/EventCollectorGlobal.cc
index d09b95210..6c03a5b9a 100644
--- a/runtime/onert/core/src/util/EventCollectorGlobal.cc
+++ b/runtime/onert/core/src/util/EventCollectorGlobal.cc
@@ -21,6 +21,7 @@
 #include <iostream>
 
 #include "util/ConfigSource.h"
+#include "util/EventWriter.h"
 
 namespace onert
 {
@@ -39,8 +40,8 @@ EventCollectorGlobal::~EventCollectorGlobal()
     try
     {
       // TODO Need better way for saved file path than the hardcoded path
-      std::ofstream ofs{"trace.global.json"};
-      _recorder.writeToFile(ofs);
+      EventWriter{_recorder}.writeToFile("trace.global.json",
+                                         EventWriter::WriteFormat::CHROME_TRACING);
     }
     catch (const std::exception &e)
     {
diff --git a/runtime/onert/core/src/util/EventRecorder.cc b/runtime/onert/core/src/util/EventRecorder.cc
index 13a599bed..3714e4f02 100644
--- a/runtime/onert/core/src/util/EventRecorder.cc
+++ b/runtime/onert/core/src/util/EventRecorder.cc
@@ -16,389 +16,6 @@
 
 #include "util/EventRecorder.h"
 
-#include <sstream>
-#include <vector>
-#include <unordered_map>
-#include <json/json.h>
-#include <assert.h>
-#include <utility>
-#include <map>
-#include <set>
-#include <stdint.h>
-
-// json type for Chrome Event Trace
-namespace
-{
-
-std::string quote(const std::string &value)
-{
-  std::stringstream ss;
-  ss << '"' << value << '"';
-  return ss.str();
-}
-
-std::string field(const std::string &k, const std::string &v)
-{
-  std::stringstream ss;
-  ss << quote(k) << " : " << quote(v);
-  return ss.str();
-}
-
-struct Content // One Entry in Chrome Event Trace
-{
-  std::vector<std::pair<std::string, std::string>> flds;
-  std::vector<std::pair<std::string, std::string>> args;
-};
-
-std::string object(const Content &content)
-{
-  std::stringstream ss;
-
-  ss << "{ ";
-
-  ss << field(content.flds[0].first, content.flds[0].second);
-
-  for (uint32_t n = 1; n < content.flds.size(); ++n)
-  {
-    ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second);
-  }
-
-  if (content.args.size() > 0)
-  {
-    ss << ", " << quote("args") << " : { ";
-    ss << field(content.args.at(0).first, content.args.at(0).second);
-
-    for (uint32_t n = 1; n < content.args.size(); ++n)
-    {
-      ss << ", " << field(content.args.at(n).first, content.args.at(n).second);
-    }
-
-    ss << "}";
-  }
-
-  ss << " }";
-
-  return ss.str();
-}
-
-void fill(Content &content, const Event &evt)
-{
-  content.flds.emplace_back("name", evt.name);
-  content.flds.emplace_back("pid", "0");
-  content.flds.emplace_back("tid", evt.tid);
-  content.flds.emplace_back("ph", evt.ph);
-  content.flds.emplace_back("ts", evt.ts);
-}
-
-std::string object(const DurationEvent &evt)
-{
-  Content content;
-
-  fill(content, evt);
-
-  return ::object(content);
-}
-
-std::string object(const CounterEvent &evt)
-{
-  Content content;
-
-  fill(content, evt);
-
-  for (auto it = evt.values.begin(); it != evt.values.end(); ++it)
-  {
-    content.args.emplace_back(it->first, it->second);
-  }
-
-  return ::object(content);
-}
-
-} // namespace
-
-// md table type
-namespace
-{
-
-void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list)
-{
-  os << "| ";
-  for (auto &key : list)
-  {
-    os << key << " | ";
-  }
-  os << "\n";
-}
-
-struct MDContent
-{
-  std::string name;
-  uint64_t begin_ts;
-  uint64_t end_ts;
-  uint32_t min_rss;
-  uint32_t max_rss;
-  uint32_t min_page_reclaims;
-  uint32_t max_page_reclaims;
-
-  MDContent()
-      : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX),
-        max_page_reclaims(0)
-  {
-    // DO NOTHING
-  }
-
-  virtual ~MDContent() = default;
-
-  void updateRss(uint32_t rss)
-  {
-    if (min_rss == UINT32_MAX)
-      min_rss = rss;
-    if (max_rss == 0)
-      max_rss = rss;
-
-    if (min_rss > rss)
-      min_rss = rss;
-    else if (max_rss < rss)
-      max_rss = rss;
-  }
-
-  void updateMinflt(uint32_t minflt)
-  {
-    if (min_page_reclaims == UINT32_MAX)
-      min_page_reclaims = minflt;
-    if (max_page_reclaims == 0)
-      max_page_reclaims = minflt;
-
-    if (min_page_reclaims > minflt)
-      min_page_reclaims = minflt;
-    else if (max_page_reclaims < minflt)
-      max_page_reclaims = minflt;
-  }
-
-  virtual void write(std::ostream &os) const = 0;
-};
-
-struct OpSeq : public MDContent
-{
-  std::string backend;
-  uint64_t graph_latency;
-
-  struct OpSeqCmp
-  {
-    bool operator()(const OpSeq &lhs, const OpSeq &rhs) const
-    {
-      return lhs.begin_ts < rhs.begin_ts;
-    }
-    bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
-    bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
-  };
-
-  void write(std::ostream &os) const override
-  {
-    uint64_t opseq_latency = end_ts - begin_ts;
-    double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0;
-    writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per),
-                         std::to_string(min_rss), std::to_string(max_rss),
-                         std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)});
-  }
-};
-
-struct Graph : public MDContent
-{
-  std::set<OpSeq, OpSeq::OpSeqCmp> opseqs;
-
-  void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq)
-  {
-    uint64_t graph_latency = end_ts - begin_ts;
-    for (auto it : name_to_opseq)
-    {
-      auto opseq = it.second;
-      opseq.graph_latency = graph_latency;
-
-      opseqs.insert(opseq);
-
-      updateRss(opseq.min_rss);
-      updateRss(opseq.max_rss);
-      updateMinflt(opseq.min_page_reclaims);
-      updateMinflt(opseq.max_page_reclaims);
-    }
-  }
-
-  void write(std::ostream &os) const override
-  {
-    static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)",
-                                                  "page_reclaims_min", "page_reclaims_max"};
-
-    static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------",
-                                                       "-----------------", "-----------------"};
-
-    // Graph's Header
-    writeMDTableRow(os, graph_headers);
-    writeMDTableRow(os, graph_headers_line);
-
-    // Graph's contents
-    writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss),
-                         std::to_string(max_rss), std::to_string(min_page_reclaims),
-                         std::to_string(max_page_reclaims)});
-
-    os << "\n";
-
-    static std::vector<std::string> opseq_headers{
-        "OpSeq name",  "backend",     "latency(us)",       "latency(%)",
-        "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"};
-
-    static std::vector<std::string> opseq_headers_line{
-        "----------", "-------", "-----------",       "-----------",
-        "-------",    "-------", "-----------------", "-----------------"};
-
-    os << "## OpSequences \n";
-
-    // OpSeq's Header
-    writeMDTableRow(os, opseq_headers);
-    writeMDTableRow(os, opseq_headers_line);
-
-    // OpSeq's contents
-    for (auto opseq : opseqs)
-    {
-      opseq.write(os);
-    }
-
-    os << "\n";
-  }
-};
-
-struct MDTableBuilder
-{
-  MDTableBuilder(const std::vector<DurationEvent> &duration_events,
-                 const std::vector<CounterEvent> &counter_events)
-      : _duration_events(duration_events), _counter_events(counter_events)
-  {
-    for (const auto &evt : _counter_events)
-    {
-      uint64_t ts = std::stoull(evt.ts);
-      auto &name = evt.name;
-      assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0);
-      assert(evt.values.size() == 1);
-      auto &val = evt.values.begin()->second;
-      if (_ts_to_values.find(ts) == _ts_to_values.end())
-      {
-        std::pair<uint32_t, uint32_t> values;
-        if (name.compare("maxrss") == 0)
-          values.first = std::stoul(val);
-        else
-          values.second = std::stoul(val);
-        _ts_to_values.insert({ts, values});
-      }
-      else
-      {
-        auto &values = _ts_to_values.at(ts);
-        if (name.compare("maxrss") == 0)
-          values.first = std::stoul(val);
-        else
-          values.second = std::stoul(val);
-      }
-    }
-  }
-
-  MDTableBuilder &build()
-  {
-    for (auto &it : divideGraph())
-    {
-      size_t begin_idx = it.first;
-      size_t end_idx = it.second;
-      std::map<std::string, OpSeq> name_to_opseq;
-      for (size_t i = begin_idx + 1; i < end_idx; ++i)
-      {
-        const auto &evt = _duration_events[i];
-        assert(evt.name.compare("Graph") != 0);
-        assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0);
-        if (evt.ph.compare("B") == 0)
-        {
-          assert(name_to_opseq.find(evt.name) == name_to_opseq.end());
-          name_to_opseq.insert({evt.name, makeOpSeq(evt)});
-        }
-        else
-        {
-          assert(name_to_opseq.find(evt.name) != name_to_opseq.end());
-          auto &opseq = name_to_opseq.at(evt.name);
-          updateOpSeq(opseq, evt);
-        }
-      }
-
-      _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq));
-    }
-
-    return *this;
-  }
-
-  std::vector<std::pair<size_t, size_t>> divideGraph()
-  {
-    std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx>
-    for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i)
-    {
-      const auto &evt = _duration_events.at(i);
-      if (evt.name.compare("Graph") == 0)
-      {
-        if (evt.ph.compare("B") == 0)
-          begin_idx = i;
-        else
-          graph_idx_list.emplace_back(begin_idx, i);
-      }
-    }
-    return graph_idx_list;
-  }
-
-  OpSeq makeOpSeq(const DurationEvent &evt)
-  {
-    OpSeq opseq;
-    opseq.name = evt.name;
-    opseq.begin_ts = std::stoull(evt.ts);
-    opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first);
-    opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second);
-    opseq.backend = evt.tid;
-    return opseq;
-  }
-
-  void updateOpSeq(OpSeq &opseq, const DurationEvent &evt)
-  {
-    opseq.end_ts = std::stoull(evt.ts);
-    opseq.updateRss(_ts_to_values.at(opseq.end_ts).first);
-    opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second);
-  }
-
-  Graph makeGraph(size_t begin_idx, size_t end_idx,
-                  const std::map<std::string, OpSeq> &name_to_opseq)
-  {
-    Graph graph;
-    graph.name = "Graph";
-    graph.begin_ts = std::stoull(_duration_events[begin_idx].ts);
-    graph.updateRss(_ts_to_values.at(graph.begin_ts).first);
-    graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second);
-    graph.end_ts = std::stoull(_duration_events[end_idx].ts);
-    graph.updateRss(_ts_to_values.at(graph.end_ts).first);
-    graph.updateMinflt(_ts_to_values.at(graph.end_ts).second);
-    graph.setOpSeqs(name_to_opseq);
-    return graph;
-  }
-
-  void write(std::ostream &os)
-  {
-    // Write contents
-    for (size_t i = 0; i < _graphs.size(); ++i)
-    {
-      os << "# Graph " << i << "\n";
-      _graphs.at(i).write(os);
-    }
-  }
-
-  const std::vector<DurationEvent> &_duration_events;
-  const std::vector<CounterEvent> &_counter_events;
-  // timestamp to std::pair<maxrss, minflt>
-  std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values;
-  std::vector<Graph> _graphs;
-};
-
-} // namespace
-
 void EventRecorder::emit(const DurationEvent &evt)
 {
   std::lock_guard<std::mutex> lock{_mu};
@@ -412,146 +29,3 @@ void EventRecorder::emit(const CounterEvent &evt)
 
   _counter_events.push_back(evt);
 }
-
-void EventRecorder::writeToFile(std::ostream &os)
-{
-  std::lock_guard<std::mutex> lock{_mu};
-
-  switch (_write_format)
-  {
-    case WriteFormat::CHROME_TRACING:
-      writeChromeTrace(os);
-      break;
-    case WriteFormat::SNPE_BENCHMARK:
-      writeSNPEBenchmark(os);
-      break;
-    case WriteFormat::MD_TABLE:
-      writeMDTable(os);
-      break;
-    default:
-      assert(!"Invalid value");
-      break;
-  }
-}
-
-void EventRecorder::writeSNPEBenchmark(std::ostream &os)
-{
-  Json::Value root;
-  auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue};
-
-  struct Stat
-  {
-    uint64_t sum = 0;
-    uint64_t count = 0;
-    uint64_t max = 0;
-    uint64_t min = std::numeric_limits<uint64_t>::max();
-
-    void accumulate(uint64_t val)
-    {
-      sum += val;
-      count++;
-      max = std::max(max, val);
-      min = std::min(min, val);
-    }
-  };
-
-  // Memory
-  {
-    std::unordered_map<std::string, Stat> mem_stats;
-    for (auto &evt : _counter_events)
-    {
-      auto &mem_stat = mem_stats[evt.name];
-      uint64_t val = std::stoull(evt.values["value"]);
-      mem_stat.accumulate(val);
-    }
-
-    auto &mem = exec_data["memory"] = Json::Value{Json::objectValue};
-    for (auto &kv : mem_stats)
-    {
-      auto &key = kv.first;
-      auto &val = kv.second;
-      mem[key]["Avg_Size"] = val.sum / val.count;
-      mem[key]["Max_Size"] = val.max;
-      mem[key]["Min_Size"] = val.min;
-      mem[key]["Runtime"] = "NA";
-    }
-  }
-
-  // Operation Execution Time
-  {
-    // NOTE This assumes _duration_events is sorted by "ts" ascending
-
-    // 2D keys : stats[tid][name]
-    std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats;
-    std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps;
-    for (auto &evt : _duration_events)
-    {
-      auto &stat = stats[evt.tid][evt.name];
-      auto &begin_ts = begin_timestamps[evt.tid][evt.name];
-      uint64_t timestamp = std::stoull(evt.ts);
-      if (evt.ph == "B")
-      {
-        if (begin_ts != 0)
-          throw std::runtime_error{"Invalid Data"};
-        begin_ts = timestamp;
-      }
-      else if (evt.ph == "E")
-      {
-        if (begin_ts == 0 || timestamp < begin_ts)
-          throw std::runtime_error{"Invalid Data"};
-        stat.accumulate(timestamp - begin_ts);
-        begin_ts = 0;
-      }
-      else
-        throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""};
-    }
-
-    for (auto &kv : begin_timestamps)
-      for (auto &kv2 : kv.second)
-        if (kv2.second != 0)
-          throw std::runtime_error{"Invalid Data - B and E pair does not match."};
-
-    for (auto &kv : stats)
-    {
-      auto &tid = kv.first;
-      auto &map = kv.second;
-      auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue};
-      for (auto &kv : map)
-      {
-        auto &name = kv.first;
-        auto &val = kv.second;
-        json_tid[name]["Avg_Time"] = val.sum / val.count;
-        json_tid[name]["Max_Time"] = val.max;
-        json_tid[name]["Min_Time"] = val.min;
-        json_tid[name]["Runtime"] = tid;
-      }
-    }
-  }
-
-  os << root;
-}
-
-void EventRecorder::writeChromeTrace(std::ostream &os)
-{
-  os << "{\n";
-  os << "  " << quote("traceEvents") << ": [\n";
-
-  for (auto &evt : _duration_events)
-  {
-    os << "    " << object(evt) << ",\n";
-  }
-
-  for (auto &evt : _counter_events)
-  {
-    os << "    " << object(evt) << ",\n";
-  }
-
-  os << "    { }\n";
-  os << "  ]\n";
-  os << "}\n";
-}
-
-void EventRecorder::writeMDTable(std::ostream &os)
-{
-  MDTableBuilder(_duration_events, _counter_events).build().write(os);
-}
diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h
index 37ec1a0f1..7af4c7ddb 100644
--- a/runtime/onert/core/src/util/EventRecorder.h
+++ b/runtime/onert/core/src/util/EventRecorder.h
@@ -21,7 +21,6 @@
 #include <memory>
 #include <mutex>
 
-#include <ostream>
 #include <vector>
 
 struct Event
@@ -50,14 +49,6 @@ struct CounterEvent : public Event
 class EventRecorder
 {
 public:
-  enum class WriteFormat
-  {
-    CHROME_TRACING,
-    SNPE_BENCHMARK,
-    MD_TABLE,
-  };
-
-public:
   EventRecorder() = default;
 
 public:
@@ -66,18 +57,11 @@ public:
 
 public:
   bool empty() { return _duration_events.empty() && _counter_events.empty(); }
-  void writeToFile(std::ostream &os);
-  void setWriteFormat(WriteFormat write_format) { _write_format = write_format; }
-
-private:
-  void writeSNPEBenchmark(std::ostream &os);
-  void writeChromeTrace(std::ostream &os);
-  void writeMDTable(std::ostream &os);
+  const std::vector<DurationEvent> &duration_events() const { return _duration_events; }
+  const std::vector<CounterEvent> &counter_events() const { return _counter_events; }
 
 private:
   std::mutex _mu;
-  // TODO: Allow user to control write_format
-  WriteFormat _write_format{WriteFormat::SNPE_BENCHMARK};
   std::vector<DurationEvent> _duration_events;
   std::vector<CounterEvent> _counter_events;
 };
diff --git a/runtime/onert/core/src/util/EventWriter.cc b/runtime/onert/core/src/util/EventWriter.cc
new file mode 100644
index 000000000..dacb40e64
--- /dev/null
+++ b/runtime/onert/core/src/util/EventWriter.cc
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/EventWriter.h"
+
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <json/json.h>
+#include <assert.h>
+#include <utility>
+#include <map>
+#include <set>
+#include <stdint.h>
+#include <fstream>
+
+// json type for Chrome Event Trace
+namespace
+{
+
+std::string quote(const std::string &value)
+{
+  std::stringstream ss;
+  ss << '"' << value << '"';
+  return ss.str();
+}
+
+std::string field(const std::string &k, const std::string &v)
+{
+  std::stringstream ss;
+  ss << quote(k) << " : " << quote(v);
+  return ss.str();
+}
+
+struct Content // One Entry in Chrome Event Trace
+{
+  std::vector<std::pair<std::string, std::string>> flds;
+  std::vector<std::pair<std::string, std::string>> args;
+};
+
+std::string object(const Content &content)
+{
+  std::stringstream ss;
+
+  ss << "{ ";
+
+  ss << field(content.flds[0].first, content.flds[0].second);
+
+  for (uint32_t n = 1; n < content.flds.size(); ++n)
+  {
+    ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second);
+  }
+
+  if (content.args.size() > 0)
+  {
+    ss << ", " << quote("args") << " : { ";
+    ss << field(content.args.at(0).first, content.args.at(0).second);
+
+    for (uint32_t n = 1; n < content.args.size(); ++n)
+    {
+      ss << ", " << field(content.args.at(n).first, content.args.at(n).second);
+    }
+
+    ss << "}";
+  }
+
+  ss << " }";
+
+  return ss.str();
+}
+
+void fill(Content &content, const Event &evt)
+{
+  content.flds.emplace_back("name", evt.name);
+  content.flds.emplace_back("pid", "0");
+  content.flds.emplace_back("tid", evt.tid);
+  content.flds.emplace_back("ph", evt.ph);
+  content.flds.emplace_back("ts", evt.ts);
+}
+
+std::string object(const DurationEvent &evt)
+{
+  Content content;
+
+  fill(content, evt);
+
+  return ::object(content);
+}
+
+std::string object(const CounterEvent &evt)
+{
+  Content content;
+
+  fill(content, evt);
+
+  for (auto it = evt.values.begin(); it != evt.values.end(); ++it)
+  {
+    content.args.emplace_back(it->first, it->second);
+  }
+
+  return ::object(content);
+}
+
+} // namespace
+
+// md table type
+namespace
+{
+
+void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list)
+{
+  os << "| ";
+  for (auto &key : list)
+  {
+    os << key << " | ";
+  }
+  os << "\n";
+}
+
+struct MDContent
+{
+  std::string name;
+  uint64_t begin_ts;
+  uint64_t end_ts;
+  uint32_t min_rss;
+  uint32_t max_rss;
+  uint32_t min_page_reclaims;
+  uint32_t max_page_reclaims;
+
+  MDContent()
+      : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX),
+        max_page_reclaims(0)
+  {
+    // DO NOTHING
+  }
+
+  virtual ~MDContent() = default;
+
+  void updateRss(uint32_t rss)
+  {
+    if (min_rss == UINT32_MAX)
+      min_rss = rss;
+    if (max_rss == 0)
+      max_rss = rss;
+
+    if (min_rss > rss)
+      min_rss = rss;
+    else if (max_rss < rss)
+      max_rss = rss;
+  }
+
+  void updateMinflt(uint32_t minflt)
+  {
+    if (min_page_reclaims == UINT32_MAX)
+      min_page_reclaims = minflt;
+    if (max_page_reclaims == 0)
+      max_page_reclaims = minflt;
+
+    if (min_page_reclaims > minflt)
+      min_page_reclaims = minflt;
+    else if (max_page_reclaims < minflt)
+      max_page_reclaims = minflt;
+  }
+
+  virtual void write(std::ostream &os) const = 0;
+};
+
+struct OpSeq : public MDContent
+{
+  std::string backend;
+  uint64_t graph_latency;
+
+  struct OpSeqCmp
+  {
+    bool operator()(const OpSeq &lhs, const OpSeq &rhs) const
+    {
+      return lhs.begin_ts < rhs.begin_ts;
+    }
+    bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
+    bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
+  };
+
+  void write(std::ostream &os) const override
+  {
+    uint64_t opseq_latency = end_ts - begin_ts;
+    double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0;
+    writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per),
+                         std::to_string(min_rss), std::to_string(max_rss),
+                         std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)});
+  }
+};
+
+struct Graph : public MDContent
+{
+  std::set<OpSeq, OpSeq::OpSeqCmp> opseqs;
+
+  void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq)
+  {
+    uint64_t graph_latency = end_ts - begin_ts;
+    for (auto it : name_to_opseq)
+    {
+      auto opseq = it.second;
+      opseq.graph_latency = graph_latency;
+
+      opseqs.insert(opseq);
+
+      updateRss(opseq.min_rss);
+      updateRss(opseq.max_rss);
+      updateMinflt(opseq.min_page_reclaims);
+      updateMinflt(opseq.max_page_reclaims);
+    }
+  }
+
+  void write(std::ostream &os) const override
+  {
+    static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)",
+                                                  "page_reclaims_min", "page_reclaims_max"};
+
+    static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------",
+                                                       "-----------------", "-----------------"};
+
+    // Graph's Header
+    writeMDTableRow(os, graph_headers);
+    writeMDTableRow(os, graph_headers_line);
+
+    // Graph's contents
+    writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss),
+                         std::to_string(max_rss), std::to_string(min_page_reclaims),
+                         std::to_string(max_page_reclaims)});
+
+    os << "\n";
+
+    static std::vector<std::string> opseq_headers{
+        "OpSeq name",  "backend",     "latency(us)",       "latency(%)",
+        "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"};
+
+    static std::vector<std::string> opseq_headers_line{
+        "----------", "-------", "-----------",       "-----------",
+        "-------",    "-------", "-----------------", "-----------------"};
+
+    os << "## OpSequences \n";
+
+    // OpSeq's Header
+    writeMDTableRow(os, opseq_headers);
+    writeMDTableRow(os, opseq_headers_line);
+
+    // OpSeq's contents
+    for (auto opseq : opseqs)
+    {
+      opseq.write(os);
+    }
+
+    os << "\n";
+  }
+};
+
+struct MDTableBuilder
+{
+  MDTableBuilder(const std::vector<DurationEvent> &duration_events,
+                 const std::vector<CounterEvent> &counter_events)
+      : _duration_events(duration_events), _counter_events(counter_events)
+  {
+// when ready with low overhead in release build
+#ifdef DEBUG
+    for (const auto &evt : _counter_events)
+    {
+      uint64_t ts = std::stoull(evt.ts);
+      auto &name = evt.name;
+      assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0);
+      assert(evt.values.size() == 1);
+      auto &val = evt.values.begin()->second;
+      if (_ts_to_values.find(ts) == _ts_to_values.end())
+      {
+        std::pair<uint32_t, uint32_t> values;
+        if (name.compare("maxrss") == 0)
+          values.first = std::stoul(val);
+        else
+          values.second = std::stoul(val);
+        _ts_to_values.insert({ts, values});
+      }
+      else
+      {
+        auto &values = _ts_to_values.at(ts);
+        if (name.compare("maxrss") == 0)
+          values.first = std::stoul(val);
+        else
+          values.second = std::stoul(val);
+      }
+    }
+#endif
+  }
+
+  MDTableBuilder &build()
+  {
+    for (auto &it : divideGraph())
+    {
+      size_t begin_idx = it.first;
+      size_t end_idx = it.second;
+      std::map<std::string, OpSeq> name_to_opseq;
+      for (size_t i = begin_idx + 1; i < end_idx; ++i)
+      {
+        const auto &evt = _duration_events[i];
+        assert(evt.name.compare("Graph") != 0);
+        assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0);
+        if (evt.ph.compare("B") == 0)
+        {
+          assert(name_to_opseq.find(evt.name) == name_to_opseq.end());
+          name_to_opseq.insert({evt.name, makeOpSeq(evt)});
+        }
+        else
+        {
+          assert(name_to_opseq.find(evt.name) != name_to_opseq.end());
+          auto &opseq = name_to_opseq.at(evt.name);
+          updateOpSeq(opseq, evt);
+        }
+      }
+
+      _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq));
+    }
+
+    return *this;
+  }
+
+  std::vector<std::pair<size_t, size_t>> divideGraph()
+  {
+    std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx>
+    for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i)
+    {
+      const auto &evt = _duration_events.at(i);
+      if (evt.name.compare("Graph") == 0)
+      {
+        if (evt.ph.compare("B") == 0)
+          begin_idx = i;
+        else
+          graph_idx_list.emplace_back(begin_idx, i);
+      }
+    }
+    return graph_idx_list;
+  }
+
+  OpSeq makeOpSeq(const DurationEvent &evt)
+  {
+    OpSeq opseq;
+    opseq.name = evt.name;
+    opseq.begin_ts = std::stoull(evt.ts);
+    opseq.backend = evt.tid;
+#ifdef DEBUG
+    opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first);
+    opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second);
+#else
+    opseq.updateRss(0);
+    opseq.updateMinflt(0);
+#endif
+    return opseq;
+  }
+
+  void updateOpSeq(OpSeq &opseq, const DurationEvent &evt)
+  {
+    opseq.end_ts = std::stoull(evt.ts);
+#ifdef DEBUG
+    opseq.updateRss(_ts_to_values.at(opseq.end_ts).first);
+    opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second);
+#else
+    opseq.updateRss(0);
+    opseq.updateMinflt(0);
+#endif
+  }
+
+  Graph makeGraph(size_t begin_idx, size_t end_idx,
+                  const std::map<std::string, OpSeq> &name_to_opseq)
+  {
+    Graph graph;
+    graph.name = "Graph";
+    graph.begin_ts = std::stoull(_duration_events[begin_idx].ts);
+    graph.end_ts = std::stoull(_duration_events[end_idx].ts);
+    graph.setOpSeqs(name_to_opseq);
+#ifdef DEBUG
+    graph.updateRss(_ts_to_values.at(graph.begin_ts).first);
+    graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second);
+    graph.updateRss(_ts_to_values.at(graph.end_ts).first);
+    graph.updateMinflt(_ts_to_values.at(graph.end_ts).second);
+#else
+    graph.updateRss(0);
+    graph.updateMinflt(0);
+#endif
+    return graph;
+  }
+
+  void write(std::ostream &os)
+  {
+    // Write contents
+    for (size_t i = 0; i < _graphs.size(); ++i)
+    {
+      os << "# Graph " << i << "\n";
+      _graphs.at(i).write(os);
+    }
+  }
+
+  const std::vector<DurationEvent> &_duration_events;
+  const std::vector<CounterEvent> &_counter_events;
+  // timestamp to std::pair<maxrss, minflt>
+  std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values;
+  std::vector<Graph> _graphs;
+};
+
+} // namespace
+
+EventWriter::EventWriter(const EventRecorder &recorder) : _recorder(recorder)
+{
+  // DO NOTHING
+}
+
+void EventWriter::writeToFiles(const std::string &base_filepath)
+{
+  // Note. According to an internal issue, let snpe json as just file name not '.snpe.json'
+  writeToFile(base_filepath, WriteFormat::SNPE_BENCHMARK);
+  writeToFile(base_filepath + ".chrome.json", WriteFormat::CHROME_TRACING);
+  writeToFile(base_filepath + ".table.md", WriteFormat::MD_TABLE);
+}
+
+void EventWriter::writeToFile(const std::string &filepath, WriteFormat write_format)
+{
+  std::ofstream os{filepath, std::ofstream::out};
+  switch (write_format)
+  {
+    case WriteFormat::CHROME_TRACING:
+      writeChromeTrace(os);
+      break;
+    case WriteFormat::SNPE_BENCHMARK:
+      writeSNPEBenchmark(os);
+      break;
+    case WriteFormat::MD_TABLE:
+      writeMDTable(os);
+      break;
+    default:
+      assert(!"Invalid value");
+      break;
+  }
+}
+
+void EventWriter::writeSNPEBenchmark(std::ostream &os)
+{
+  Json::Value root;
+  auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue};
+
+  struct Stat
+  {
+    uint64_t sum = 0;
+    uint64_t count = 0;
+    uint64_t max = 0;
+    uint64_t min = std::numeric_limits<uint64_t>::max();
+
+    void accumulate(uint64_t val)
+    {
+      sum += val;
+      count++;
+      max = std::max(max, val);
+      min = std::min(min, val);
+    }
+  };
+
+  // Memory
+  {
+    std::unordered_map<std::string, Stat> mem_stats;
+    for (auto &evt : _recorder.counter_events())
+    {
+      auto &mem_stat = mem_stats[evt.name];
+      uint64_t val = std::stoull(evt.values.at("value"));
+      mem_stat.accumulate(val);
+    }
+
+    auto &mem = exec_data["memory"] = Json::Value{Json::objectValue};
+    for (auto &kv : mem_stats)
+    {
+      auto &key = kv.first;
+      auto &val = kv.second;
+      mem[key]["Avg_Size"] = val.sum / val.count;
+      mem[key]["Max_Size"] = val.max;
+      mem[key]["Min_Size"] = val.min;
+      mem[key]["Runtime"] = "NA";
+    }
+  }
+
+  // Operation Execution Time
+  {
+    // NOTE This assumes _duration_events is sorted by "ts" ascending
+
+    // 2D keys : stats[tid][name]
+    std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats;
+    std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps;
+    for (auto &evt : _recorder.duration_events())
+    {
+      auto &stat = stats[evt.tid][evt.name];
+      auto &begin_ts = begin_timestamps[evt.tid][evt.name];
+      uint64_t timestamp = std::stoull(evt.ts);
+      if (evt.ph == "B")
+      {
+        if (begin_ts != 0)
+          throw std::runtime_error{"Invalid Data"};
+        begin_ts = timestamp;
+      }
+      else if (evt.ph == "E")
+      {
+        if (begin_ts == 0 || timestamp < begin_ts)
+          throw std::runtime_error{"Invalid Data"};
+        stat.accumulate(timestamp - begin_ts);
+        begin_ts = 0;
+      }
+      else
+        throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""};
+    }
+
+    for (auto &kv : begin_timestamps)
+      for (auto &kv2 : kv.second)
+        if (kv2.second != 0)
+          throw std::runtime_error{"Invalid Data - B and E pair does not match."};
+
+    for (auto &kv : stats)
+    {
+      auto &tid = kv.first;
+      auto &map = kv.second;
+      auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue};
+      for (auto &kv : map)
+      {
+        auto &name = kv.first;
+        auto &val = kv.second;
+        json_tid[name]["Avg_Time"] = val.sum / val.count;
+        json_tid[name]["Max_Time"] = val.max;
+        json_tid[name]["Min_Time"] = val.min;
+        json_tid[name]["Runtime"] = tid;
+      }
+    }
+  }
+
+  os << root;
+}
+
+void EventWriter::writeChromeTrace(std::ostream &os)
+{
+  os << "{\n";
+  os << "  " << quote("traceEvents") << ": [\n";
+
+  for (auto &evt : _recorder.duration_events())
+  {
+    os << "    " << object(evt) << ",\n";
+  }
+
+  for (auto &evt : _recorder.counter_events())
+  {
+    os << "    " << object(evt) << ",\n";
+  }
+
+  os << "    { }\n";
+  os << "  ]\n";
+  os << "}\n";
+}
+
+void EventWriter::writeMDTable(std::ostream &os)
+{
+  MDTableBuilder(_recorder.duration_events(), _recorder.counter_events()).build().write(os);
+}
diff --git a/runtime/onert/core/src/util/EventWriter.h b/runtime/onert/core/src/util/EventWriter.h
new file mode 100644
index 000000000..7e838ca82
--- /dev/null
+++ b/runtime/onert/core/src/util/EventWriter.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_UTIL_EVENT_WRITER_H__
+#define __ONERT_UTIL_EVENT_WRITER_H__
+
+#include "EventRecorder.h"
+
+#include <string>
+#include <ostream>
+
+class EventWriter
+{
+public:
+  enum class WriteFormat
+  {
+    CHROME_TRACING,
+    SNPE_BENCHMARK,
+    MD_TABLE,
+  };
+
+public:
+  EventWriter(const EventRecorder &recorder);
+
+public:
+  void writeToFiles(const std::string &base_filepath);
+  void writeToFile(const std::string &filepath, WriteFormat write_format);
+
+private:
+  void writeSNPEBenchmark(std::ostream &os);
+  void writeChromeTrace(std::ostream &os);
+  void writeMDTable(std::ostream &os);
+
+private:
+  const EventRecorder &_recorder;
+};
+
+#endif // __ONERT_UTIL_EVENT_WRITER_H__
diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc
index 95c15049d..0278df4d2 100644
--- a/runtime/onert/core/src/util/ShapeInference.cc
+++ b/runtime/onert/core/src/util/ShapeInference.cc
@@ -22,6 +22,7 @@
 #include "util/logging.h"
 
 #include <cassert>
+#include <numeric>
 #include <sstream>
 #include <cmath>
 
@@ -72,6 +73,19 @@ ir::Shape broadcastShapes(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape
 
 } // namespace
 
+namespace bcq
+{
+inline int getOutputSize(const ir::Shape &cluster_shape, const int32_t *cluster_buf)
+{
+  int size = 0;
+  for (int idx = 0; idx < cluster_shape.dim(0); idx++)
+  {
+    size += cluster_buf[idx * 2 + 1];
+  }
+  return size;
+}
+} // namespace bcq
+
 //
 // Shape inference
 //
@@ -116,6 +130,11 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha
 
 ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank)
 {
+  if (axis < 0 || axis >= rank)
+  {
+    throw std::runtime_error("ArgMax shape inference: Wrong axis value " + std::to_string(axis));
+  }
+
   ir::Shape out_shape;
   for (int idx = 0; idx < rank; ++idx)
   {
@@ -259,19 +278,24 @@ ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs
   return output_shape;
 }
 
-ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer)
+/*
+ * shp_shape : SHAPE input tensor's shape
+ * shp_buf : SHAPE input tensor's buffer
+ */
+ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf)
 {
-  const int num_elements = wshape.num_elements();
+
+  const int num_elements = shp_shape.num_elements();
 
   assert(num_elements != 0);
-  assert(shape_buffer);
+  assert(shp_buf);
 
   ir::Shape new_shape(num_elements);
 
   for (int i = 0; i < num_elements; ++i)
   {
-    assert(shape_buffer[i] != 0); // It shouldn't be 0.
-    new_shape.dim(i) = shape_buffer[i];
+    assert(shp_buf[i] != 0); // It shouldn't be 0.
+    new_shape.dim(i) = shp_buf[i];
   }
 
   return new_shape;
@@ -305,6 +329,9 @@ ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat:
 ir::Shape inferConv2DShape(const ir::Shape &in_shape, const ir::Shape &ker_shape,
                            const ir::operation::Conv2D::Param &param, ir::Layout layout)
 {
+  if (param.stride.horizontal == 0 || param.stride.vertical == 0)
+    throw std::runtime_error{"Conv2D: stride values must be positive"};
+
   auto ifm_shape = in_shape.asFeature(layout);
 
   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]
@@ -321,6 +348,9 @@ ir::Shape inferDepthwiseConv2DShape(const ir::Shape &in_shape, const ir::Shape &
                                     const ir::operation::DepthwiseConv2D::Param &param,
                                     ir::Layout layout)
 {
+  if (param.stride.horizontal == 0 || param.stride.vertical == 0)
+    throw std::runtime_error{"DepthwiseConv2D: stride values must be positive"};
+
   assert(layout == ir::Layout::NHWC);
   auto ifm_shape = in_shape.asFeature(layout);
 
@@ -354,13 +384,13 @@ ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis)
   return out_shape;
 }
 
-ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buffer)
+ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf)
 {
   ir::Shape out_shape(in_shape.dim(0));
 
   for (int out_x = 0; out_x < out_shape.rank(); ++out_x)
   {
-    out_shape.dim(out_x) = buffer[out_x];
+    out_shape.dim(out_x) = in_buf[out_x];
   }
 
   return out_shape;
@@ -380,11 +410,60 @@ ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &k
   return {ir::Shape({static_cast<int32_t>(batch_size), num_units})};
 }
 
+ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape,
+                                      const int32_t *cluster_buf)
+{
+  assert(cluster_shape.rank() == 2);
+  assert(cluster_shape.dim(1) == 2);
+
+  const auto input_size = in_shape.dim(1);
+  const auto output_size = bcq::getOutputSize(cluster_shape, cluster_buf);
+
+  return {ir::Shape({output_size, input_size})};
+}
+
+ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape,
+                              const int32_t *cluster_buf, int rank,
+                              const ir::operation::BCQGather::Param &param)
+{
+  ir::Shape out_shape;
+  ir::Shape in_original_shape;
+
+  assert(cluster_shape.rank() == 2);
+  assert(cluster_shape.dim(1) == 2);
+
+  auto hidden_size = param.input_hidden_size;
+  auto axis = param.axis;
+
+  in_original_shape.append(bcq::getOutputSize(cluster_shape, cluster_buf));
+  in_original_shape.append(hidden_size);
+
+  const int indices_rank = indices_shape.rank();
+  for (int idx = 0; idx < rank; ++idx)
+  {
+    if (idx == (int)axis)
+    {
+      for (int indices_idx = 0; indices_idx < indices_rank; indices_idx++)
+      {
+        out_shape.append(indices_shape.dim(indices_idx));
+      }
+    }
+    else
+    {
+      out_shape.append(in_original_shape.dim(idx));
+    }
+  }
+
+  return out_shape;
+}
+
 ir::Shape inferGatherShape(const ir::Shape &input_shape, const ir::Shape &indices_shape, int axis,
                            int rank)
 {
   ir::Shape out_shape;
+
   const int indices_rank = indices_shape.rank();
+
   for (int idx = 0; idx < rank; ++idx)
   {
     if (idx == axis)
@@ -470,6 +549,9 @@ ir::Shape inferPadShape(const ir::Shape &in_shape, const int32_t *pad_buf, const
 ir::Shape inferPoolShape(const ir::Shape &in_shape, const ir::operation::Pool2D::Param &param,
                          const ir::Layout layout)
 {
+  if (param.stride.horizontal == 0 || param.stride.vertical == 0)
+    throw std::runtime_error{"Pool2D: stride values must be positive"};
+
   assert(layout == ir::Layout::NHWC);
   auto ifm_shape = in_shape.asFeature(layout);
   const auto out_h_w = calcConvLikeHeightAndWidth(ifm_shape.H, ifm_shape.W, param.kh, param.kw,
@@ -482,6 +564,17 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp
                                    const int32_t output_width)
 {
   assert(in_shape.rank() == 4);
+  if (output_height < 0)
+  {
+    throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " +
+                             std::to_string(output_height)};
+  }
+  if (output_width < 0)
+  {
+    throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " +
+                             std::to_string(output_width)};
+  }
+
   ir::Shape ret(in_shape.rank());
 
   ret.dim(0) = in_shape.dim(0);
@@ -613,7 +706,8 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i
   return new_shape;
 }
 
-ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, const int32_t *sizes)
+ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf,
+                          const int32_t *sizes_buf)
 {
   const uint32_t rank = input_shape.rank();
   ir::Shape out_shape(rank);
@@ -623,12 +717,12 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c
     const auto input_dim = input_shape.dim(idx);
 
     // begin is zero-based
-    auto begin = begins[idx];
+    auto begin = begins_buf[idx];
     if (begin < 0)
       throw std::runtime_error("shape inference Slice: Invalid begin.");
 
     // size is one-based
-    auto size = sizes[idx];
+    auto size = sizes_buf[idx];
     if (size < -1)
       throw std::runtime_error("shape inference Slice: Invalid size.");
 
@@ -648,8 +742,8 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c
 }
 
 ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape,
-                                   const ir::Shape &padding_shape, const int32_t *block_shape_data,
-                                   const int32_t *padding_data)
+                                   const ir::Shape &padding_shape, const int32_t *block_shape_buf,
+                                   const int32_t *padding_buf)
 {
   const uint32_t rank = input_shape.rank();
   ir::Shape out_shape(rank);
@@ -677,14 +771,14 @@ ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape
   for (int dim = 0; dim < kSpatialDimensionNum; ++dim)
   {
     int final_dim_size =
-        (input_shape.dim(dim + 1) + padding_data[dim * 2] + padding_data[dim * 2 + 1]);
+        (input_shape.dim(dim + 1) + padding_buf[dim * 2] + padding_buf[dim * 2 + 1]);
 
-    assert(final_dim_size % block_shape_data[dim] == 0);
+    assert(final_dim_size % block_shape_buf[dim] == 0);
 
-    out_shape.dim(dim + 1) = final_dim_size / block_shape_data[dim];
+    out_shape.dim(dim + 1) = final_dim_size / block_shape_buf[dim];
   }
 
-  const int output_batch_size = input_shape.dim(0) * block_shape_data[0] * block_shape_data[1];
+  const int output_batch_size = input_shape.dim(0) * block_shape_buf[0] * block_shape_buf[1];
   const int output_channel_size = input_shape.dim(3);
 
   out_shape.dim(0) = output_batch_size;
@@ -948,35 +1042,71 @@ ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSlic
   return out_shape;
 }
 
-ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier)
+ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf,
+                         const int32_t multiplier_size)
 {
-  // assert(in_shape.rank() == multiplier.rank());
+  if (multiplier_size != in_shape.rank())
+  {
+    throw std::runtime_error("inferTileShape failed, input rank: " +
+                             std::to_string(in_shape.rank()) + ", bad multipliers size: " +
+                             std::to_string(multiplier_size) + "");
+  }
   ir::Shape new_Shape(in_shape.rank());
 
   for (int i = 0; i < in_shape.rank(); ++i)
   {
-    assert(multiplier[i]); // multiplier[i] shuld not be 0.
-    new_Shape.dim(i) = in_shape.dim(i) * multiplier[i];
+    assert(multiplier_buf[i]); // multiplier_buf[i] shuld not be 0.
+    new_Shape.dim(i) = in_shape.dim(i) * multiplier_buf[i];
   }
   return new_Shape;
 }
 
-ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm)
+ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf,
+                              const int32_t perm_size)
 {
-  if (static_cast<int>(perm.size()) > in_shape.rank())
+  const auto rank = in_shape.rank();
+  if (perm_size > rank)
+  {
+    throw std::runtime_error("inferTransposeShape failed, bad permutation size: " +
+                             std::to_string(perm_size));
+  }
+
+  const int32_t *perm_data = perm_buf;
+  std::vector<int32_t> regular_perm_vec;
+  if (perm_size == 0)
+  {
+    // perm_data will be set to (n-1...0)
+    regular_perm_vec.resize(rank);
+    std::iota(regular_perm_vec.begin(), regular_perm_vec.end(), 0);
+    std::reverse(regular_perm_vec.begin(), regular_perm_vec.end());
+    perm_data = regular_perm_vec.data();
+  }
+  else
   {
-    throw std::runtime_error("inferTransposeShape failed, bad rank size: " +
-                             std::to_string(static_cast<int>(perm.size())));
+    assert(rank == perm_size);
   }
-  ir::Shape out_shape(static_cast<int>(perm.size()));
-  for (int idx = 0; idx < static_cast<int>(perm.size()); idx++)
+
+  ir::Shape out_shape(rank);
+  std::vector<bool> visit_perms(rank, false);
+  for (int idx = 0; idx < rank; idx++)
   {
-    if (perm[idx] < 0 || perm[idx] >= static_cast<int>(perm.size()))
+    const auto perm_val = perm_data[idx];
+    // Check invalid permutation value
+    if (perm_val < 0 || perm_val >= rank)
     {
-      throw std::runtime_error("inferTransposeShape failed, bad perm value: " +
-                               std::to_string(perm[idx]));
+      throw std::runtime_error("inferTransposeShape failed, bad permutation value: " +
+                               std::to_string(perm_val));
     }
-    out_shape.dim(idx) = in_shape.dim(perm[idx]);
+
+    // Check duplicated permutation value
+    if (visit_perms.at(perm_val))
+    {
+      throw std::runtime_error("inferTransposeShape failed, duplicated permutation value: " +
+                               std::to_string(perm_val));
+    }
+    visit_perms.at(perm_val) = true;
+
+    out_shape.dim(idx) = in_shape.dim(perm_val);
   }
   return out_shape;
 }
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index 480452e01..d21001e59 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -38,7 +39,7 @@ namespace onert
 namespace base_loader
 {
 
-template <typename LoaderDomain, typename SpecificLoader> class BaseLoader
+template <typename LoaderDomain> class BaseLoader
 {
 protected:
   using Verifier = typename LoaderDomain::Verifier;
@@ -69,6 +70,7 @@ public:
   explicit BaseLoader(std::unique_ptr<ir::Subgraphs> &subgs)
       : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr}
   {
+    _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA);
   }
 
   /**
@@ -93,7 +95,6 @@ protected:
   ir::Activation convertActivation(ActivationFunctionType type);
   ir::DataType tensorTypeToDataType(TensorType type);
   ir::OperandIndex tensorIdxToOperandIdx(int32_t tensorIdx);
-  void deallocateMmappedArea(uint8_t *ptr, size_t size);
 
   // Create operands form tflite::Tensor
   ir::OperandIndex loadOperand(const Tensor *tensor, ir::Graph &subg);
@@ -107,7 +108,11 @@ protected:
   // Load Pool2D param
   template <typename Param> void loadPool2DOptions(Param &param, const Pool2DOptions *options);
 
+private:
+  virtual std::unique_ptr<ir::Graph> loadSubgraph(const SubGraph *subg) = 0;
   // Operations
+  template <typename OpIR, typename... Args>
+  const OpIR *loadOperationTo(const Operator *op, ir::Graph &subg, Args &&... args);
   void loadConv2D(const Operator *op, ir::Graph &subg);
   void loadDepthwiseConv2D(const Operator *op, ir::Graph &subg);
   void loadTransposeConv(const Operator *op, ir::Graph &subg);
@@ -115,62 +120,50 @@ protected:
   void loadReshape(const Operator *op, ir::Graph &subg);
   void loadSoftmax(const Operator *op, ir::Graph &subg);
   void loadConcatenation(const Operator *op, ir::Graph &subg);
-  void loadFill(const Operator *op, ir::Graph &subg);
   void loadFC(const Operator *op, ir::Graph &subg);
-  template <ir::operation::BinaryArithmetic::ArithmeticType op_type>
-  void loadBinaryArithmetic(const Operator *op, ir::Graph &subg);
+  void loadBinaryArithmetic(const Operator *op, ir::Graph &subg,
+                            ir::operation::BinaryArithmetic::ArithmeticType op_type);
   void loadAddV2(const Operator *op, ir::Graph &subg);
   void loadPack(const Operator *op, ir::Graph &subg);
   void loadResizeBilinear(const Operator *op, ir::Graph &subg);
   void loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg);
-  void loadSelect(const Operator *op, ir::Graph &subg);
-  void loadSquaredDifference(const Operator *op, ir::Graph &subg);
-  void loadTranspose(const Operator *op, ir::Graph &subg);
-  template <ir::operation::Reduce::ReduceType reduce_type>
-  void loadReduce(const Operator *op, ir::Graph &subg);
+  void loadReduce(const Operator *op, ir::Graph &subg,
+                  ir::operation::Reduce::ReduceType reduce_type);
   void loadReduceAll(const Operator *op, ir::Graph &subg);
-  void loadReverseV2(const Operator *op, ir::Graph &subg);
-  void loadPad(const Operator *op, ir::Graph &subg);
   void loadElementwiseActivation(const Operator *op, ir::Graph &subg,
                                  ir::operation::ElementwiseActivation::Type op_type,
                                  float alpha = 0.f, float beta = 0.f);
-  template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type>
-  void loadElementwiseBinary(const Operator *op, ir::Graph &subg);
+  void loadElementwiseBinary(const Operator *op, ir::Graph &subg,
+                             ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type);
   void loadElementwiseUnary(const Operator *op, ir::Graph &subg,
                             ir::operation::ElementwiseUnary::Type op_type);
-  void loadExpandDims(const Operator *op, ir::Graph &subg);
   void loadGather(const Operator *op, ir::Graph &subg);
   void loadCustom(const Operator *op, ir::Graph &subg);
-  void loadSpaceToBatchND(const Operator *op, ir::Graph &subg);
   void loadBatchMatMul(const Operator *op, ir::Graph &subg);
-  void loadBatchToSpaceND(const Operator *op, ir::Graph &subg);
   void loadSqueeze(const Operator *op, ir::Graph &subg);
-  void loadPrelu(const Operator *op, ir::Graph &subg);
   void loadSplit(const Operator *op, ir::Graph &subg);
   void loadSplitV(const Operator *op, ir::Graph &subg);
-  void loadSlice(const Operator *op, ir::Graph &subg);
   void loadStridedSlice(const Operator *op, ir::Graph &subg);
   void loadUnpack(const Operator *op, ir::Graph &subg);
   void loadComparison(const Operator *op, ir::Graph &subg);
   void loadEinsum(const Operator *op, ir::Graph &subg);
   void loadOneHot(const Operator *op, ir::Graph &subg);
-  void loadShape(const Operator *op, ir::Graph &subg);
   void loadIf(const Operator *op, ir::Graph &subg);
   void loadWhile(const Operator *op, ir::Graph &subg);
   void loadArgMax(const Operator *op, ir::Graph &subg);
-  void loadPow(const Operator *op, ir::Graph &subg);
-  void loadTile(const Operator *op, ir::Graph &subg);
-  void loadRange(const Operator *op, ir::Graph &subg);
-  void loadRank(const Operator *op, ir::Graph &subg);
-  void loadMatrixBandPart(const Operator *op, ir::Graph &subg);
-  void loadBroadcastTo(const Operator *op, ir::Graph &subg);
   void loadFusedBatchNorm(const Operator *op, ir::Graph &subg);
   void loadLogSoftmax(const Operator *op, ir::Graph &subg);
   void loadSpaceToDepth(const Operator *op, ir::Graph &subg);
-  void loadStatelessRandomUniform(const Operator *op, ir::Graph &subg);
-  void loadL2Normalization(const Operator *op, ir::Graph &subg);
   void loadLeakyRelu(const Operator *op, ir::Graph &subg);
 
+  void verifySubgraphIndex(int subg_index)
+  {
+    const auto num_subgraphs = _model->subgraphs()->size();
+    if (subg_index < 0 || subg_index >= static_cast<int32_t>(num_subgraphs))
+      throw std::runtime_error{std::string{"Invalid subgraph index - "} +
+                               std::to_string(subg_index)};
+  }
+
 protected:
   // Base address for mapped region for loading (if needed)
   uint8_t *_base;
@@ -186,10 +179,12 @@ protected:
   std::unordered_map<ir::OperandIndex, std::string> _tensor_names;
   // Verifier
   std::unique_ptr<Verifier> _verifier;
+  // Boolean flag to use MMAPED_DATA
+  bool _use_mmaped_data = false;
 };
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const char *file_path)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::BaseLoader::loadFromFile(const char *file_path)
 {
   _fd = open(file_path, O_RDONLY);
   if (_fd < 0)
@@ -216,22 +211,22 @@ void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const ch
   _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size);
 
   loadModel();
+  munmap(_base, size);
 
   close(_fd);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromBuffer(uint8_t *buffer,
-                                                                          size_t size)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::BaseLoader::loadFromBuffer(uint8_t *buffer, size_t size)
 {
   _base = buffer;
   _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size);
   loadModel();
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActivation(
-    const ActivationFunctionType type)
+template <typename LoaderDomain>
+ir::Activation
+BaseLoader<LoaderDomain>::BaseLoader::convertActivation(const ActivationFunctionType type)
 {
   switch (type)
   {
@@ -246,14 +241,13 @@ ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActi
     case ActivationFunctionType::ActivationFunctionType_TANH:
       return ir::Activation::TANH;
     default:
-      throw std::runtime_error(std::string("Unsupported activation type: ")
-                                   .append(EnumNameActivationFunctionType(type)));
+      throw std::runtime_error(std::string("Unsupported or invalid activation type: ") +
+                               std::to_string(static_cast<int>(type)));
   }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-ir::DataType
-BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const TensorType type)
+template <typename LoaderDomain>
+ir::DataType BaseLoader<LoaderDomain>::BaseLoader::tensorTypeToDataType(const TensorType type)
 {
   switch (type)
   {
@@ -275,39 +269,13 @@ BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const
   }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-ir::OperandIndex
-BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx)
+template <typename LoaderDomain>
+ir::OperandIndex BaseLoader<LoaderDomain>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx)
 {
   return isOptionalInputTensor(tensorIdx) ? ir::OperandIndex() : _tensor_to_operand[tensorIdx];
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::deallocateMmappedArea(uint8_t *ptr,
-                                                                                 size_t size)
-{
-  // Calculate offset from base address of mapped region
-  ptrdiff_t unaligned_offset_start = ptr - _base;
-  ptrdiff_t unaligned_offset_end = unaligned_offset_start + size;
-
-  // Calculated aligned offset from base address of mapped region
-  // munmap accepts memory address which is a multiple of the pagesize
-  ptrdiff_t aligned_offset_start =
-      ((unaligned_offset_start + (_pagesize - 1)) / _pagesize) * _pagesize;
-  ptrdiff_t aligned_offset_end = (unaligned_offset_end / _pagesize) * _pagesize;
-
-  ptrdiff_t area_size = aligned_offset_end - aligned_offset_start;
-  if (area_size > 0)
-  {
-    // Unmap mapped region for CachedData
-    if (munmap(_base + aligned_offset_start, area_size) == -1)
-    {
-      VERBOSE(BASE_LOADER) << "munmap failed" << std::endl;
-    }
-  }
-}
-
-/* Copied from tensorflow lite. Need to append copyright */
+/* Copy is copied from tensorflow lite */
 template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr)
 {
   if (data_ptr->values() == nullptr)
@@ -324,9 +292,8 @@ template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr)
   return true;
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Tensor *tensor,
-                                                                       ir::Graph &subg)
+template <typename LoaderDomain>
+ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir::Graph &subg)
 {
   ir::Shape shape;
   // Shape
@@ -386,18 +353,44 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
   {
     std::vector<uint16_t> w1_segments;
     std::vector<uint16_t> w1_indices;
-    // ignore traversal_order, block_map
+    // check traversal_order
+    if (src_sparsity->traversal_order())
+    {
+      const int traversal_order_size = src_sparsity->traversal_order()->size();
+      for (int i = 0; i < traversal_order_size; ++i)
+      {
+        if (i != src_sparsity->traversal_order()->Get(i))
+          throw std::runtime_error("traversal_order [0, 1, ..., n-1] is only supported.");
+      }
+    }
+    // check block_map
+    int block_rank = 0;
+    if (src_sparsity->block_map())
+    {
+      block_rank = src_sparsity->block_map()->size();
+      for (int i = 0; i < block_rank; ++i)
+      {
+        if (i != src_sparsity->block_map()->Get(i))
+          throw std::runtime_error("block_map [0, 1, ..., n-1] is only supported.");
+      }
+    }
     // load metadata
-    const size_t dim_metadata_size = src_sparsity->dim_metadata()->size();
-    if (dim_metadata_size != 2)
-      throw std::runtime_error("sparse tensor is supported only for 2D");
+    const int dim_metadata_size = src_sparsity->dim_metadata()->size();
+    auto dense_rank = shape.rank();
+    if (dense_rank + block_rank != dim_metadata_size)
+      throw std::runtime_error("sparsity dim_metadata length is wrong.");
+    bool random_sparsity = dim_metadata_size == 2 && block_rank == 0;
+    bool block2D_sparsity = dim_metadata_size == 4 && block_rank == 2;
+    if (dim_metadata_size != !random_sparsity && !block2D_sparsity)
+      throw std::runtime_error(
+          "sparsity is supported only for 2D tensor with random or 16x1 block sparsity.");
+
     const auto *src_metadata = src_sparsity->dim_metadata()->Get(0);
     if (src_metadata->format() != DimensionType::DimensionType_DENSE)
       throw std::runtime_error("sparse tensor dim[0] is not DENSE");
     src_metadata = src_sparsity->dim_metadata()->Get(1);
     if (src_metadata->format() != DimensionType::DimensionType_SPARSE_CSR)
       throw std::runtime_error("sparse tensor dim[0] is not SPARSE_CSR");
-
     auto ParseSparseIndexVector = [src_metadata, &w1_segments, &w1_indices]() {
       if (src_metadata->array_segments() == nullptr || src_metadata->array_indices() == nullptr)
         return false;
@@ -433,7 +426,17 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
     };
     if (ParseSparseIndexVector() == false)
       throw std::runtime_error("Error during parsing sparsity index information");
-    type_info.sparse2DMetadata(std::move(w1_segments), std::move(w1_indices));
+    // Get block size
+    std::vector<int32_t> block_size;
+    for (int i = 0; i < block_rank; ++i)
+    {
+      auto block_metadata = src_sparsity->dim_metadata()->Get(dense_rank + i);
+      if (block_metadata->format() != DimensionType::DimensionType_DENSE)
+        throw std::runtime_error("block dimension must be DENSE.");
+      block_size.push_back(block_metadata->dense_size());
+    }
+    type_info.sparsity(std::make_shared<ir::Sparsity>(std::move(w1_segments), std::move(w1_indices),
+                                                      std::move(block_size)));
   }
   // Create operand
   const auto operand_index = subg.addOperand(shape, type_info);
@@ -450,8 +453,28 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
     }
     else // Model is loaded(mmap'd) from a file
     {
-      data_obj = std::make_unique<ir::CachedData>(data->data(), data->size());
-      deallocateMmappedArea(const_cast<uint8_t *>(data->data()), data->size());
+      size_t data_size = data->size();
+      ptrdiff_t unaligned_offset_start = data->data() - _base;
+      ptrdiff_t offset_end = unaligned_offset_start + data_size;
+
+      // Calculated aligned offset from base address of mapped region
+      // munmap accepts memory address which is a multiple of the pagesize
+      ptrdiff_t aligned_offset_start = (unaligned_offset_start / _pagesize) * _pagesize;
+      size_t mmap_size = offset_end - aligned_offset_start;
+
+      if (_use_mmaped_data)
+      {
+        data_obj = std::make_unique<ir::MMapedData>(_fd, aligned_offset_start, mmap_size,
+                                                    unaligned_offset_start, data_size);
+      }
+      else
+      {
+        size_t offset = unaligned_offset_start - aligned_offset_start;
+        uint8_t *mmap_base = static_cast<uint8_t *>(
+            mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, _fd, aligned_offset_start));
+        data_obj = std::make_unique<ir::CachedData>(mmap_base + offset, data_size);
+        munmap(mmap_base, mmap_size);
+      }
     }
     subg.setOperandValue(operand_index, std::move(data_obj));
   }
@@ -465,10 +488,9 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
   return operand_index;
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *op,
-                                                               ir::OperandIndexSequence &inputs,
-                                                               ir::OperandIndexSequence &outputs)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadOperationIO(const Operator *op, ir::OperandIndexSequence &inputs,
+                                               ir::OperandIndexSequence &outputs)
 {
   for (const std::int32_t idx : *op->inputs())
   {
@@ -490,120 +512,116 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *o
   }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
+template <typename LoaderDomain>
 template <typename Param, typename OptionsType>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadStridesAndPaddings(Param &param,
-                                                                      const OptionsType *options)
+void BaseLoader<LoaderDomain>::loadStridesAndPaddings(Param &param, const OptionsType *options)
 {
   // Strides
   param.stride.vertical = options->stride_h();
   param.stride.horizontal = options->stride_w();
   // Paddings
-  if (options->padding() == Padding::Padding_SAME)
-    param.padding.type = ir::PaddingType::SAME;
-  if (options->padding() == Padding::Padding_VALID)
-    param.padding.type = ir::PaddingType::VALID;
+  switch (options->padding())
+  {
+    case Padding::Padding_SAME:
+      param.padding.type = ir::PaddingType::SAME;
+      break;
+    case Padding::Padding_VALID:
+      param.padding.type = ir::PaddingType::VALID;
+      break;
+    default:
+      throw std::runtime_error{"Invalid padding type"};
+  }
   // param paddings indexes unused
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
+template <typename LoaderDomain>
 template <typename Param>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2DOptions(Param &param,
-                                                                 const Pool2DOptions *options)
+void BaseLoader<LoaderDomain>::loadPool2DOptions(Param &param, const Pool2DOptions *options)
 {
   // Strides and Paddings
+  if (options->stride_h() <= 0 || options->stride_w() <= 0)
+    throw std::runtime_error{"Invalid stride vertical or horizontal - both must be bigger than 0"};
   loadStridesAndPaddings(param, options);
   // Filter width and height
   // Strides
+  if (options->filter_width() <= 0 || options->filter_height() <= 0)
+    throw std::runtime_error{"Invalid filter width or height - both must be bigger than 0"};
   param.kw = options->filter_width();
   param.kh = options->filter_height();
   // Activation
   param.activation = convertActivation(options->fused_activation_function());
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadConv2D(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+template <typename OpIR, typename... Args>
+const OpIR *BaseLoader<LoaderDomain>::loadOperationTo(const Operator *op, ir::Graph &subg,
+                                                      Args &&... args)
 {
+  static_assert(sizeof...(args) <= 1, "You can't have more than 1 arguments!");
   ir::OperandIndexSequence inputs;
   ir::OperandIndexSequence outputs;
 
   loadOperationIO(op, inputs, outputs);
 
+  std::unique_ptr<OpIR> new_op(new OpIR(inputs, outputs, std::forward<Args>(args)...));
+  auto ret = new_op.get();
+  subg.addOperation(std::move(new_op));
+
+  return ret;
+}
+
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadConv2D(const Operator *op, ir::Graph &subg)
+{
   ir::operation::Conv2D::Param param;
   const auto *options = op->builtin_options_as_Conv2DOptions();
   param.activation = convertActivation(options->fused_activation_function());
   loadStridesAndPaddings(param, options);
-
   param.dilation.width_factor = options->dilation_w_factor();
   param.dilation.height_factor = options->dilation_h_factor();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Conv2D(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Conv2D>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadDepthwiseConv2D(const Operator *op,
-                                                                   ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadDepthwiseConv2D(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::DepthwiseConv2D::Param param;
   const auto *options = op->builtin_options_as_DepthwiseConv2DOptions();
   param.activation = convertActivation(options->fused_activation_function());
   loadStridesAndPaddings(param, options);
-  // Multiplier
   param.multiplier = options->depth_multiplier();
   // Dilation h/w factor unused
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::DepthwiseConv2D(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+
+  loadOperationTo<ir::operation::DepthwiseConv2D>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadTransposeConv(const Operator *op,
-                                                                 ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadTransposeConv(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::TransposeConv::Param param;
   const auto *options = op->builtin_options_as_TransposeConvOptions();
   loadStridesAndPaddings(param, options);
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::TransposeConv(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+
+  loadOperationTo<ir::operation::TransposeConv>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2D(const Operator *op, ir::Graph &subg,
-                                                          ir::operation::Pool2D::PoolType op_type)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadPool2D(const Operator *op, ir::Graph &subg,
+                                          ir::operation::Pool2D::PoolType op_type)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Pool2D::Param param;
   param.op_type = op_type;
   const auto *options = op->builtin_options_as_Pool2DOptions();
 
   loadPool2DOptions(param, options);
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Pool2D(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Pool2D>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadReshape(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Reshape::Param param{};
   const auto *options = op->builtin_options_as_ReshapeOptions();
   if (options != nullptr)
@@ -611,99 +629,64 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, i
     const auto *new_shape = options->new_shape();
     if (new_shape)
     {
-      for (uint i = 0; i < new_shape->Length(); ++i)
+      for (uint i = 0; i < new_shape->size(); ++i)
       {
         param.new_shape.push_back(new_shape->Get(i));
       }
     }
   }
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Reshape(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Reshape>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSoftmax(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSoftmax(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Softmax::Param param;
   const auto *options = op->builtin_options_as_SoftmaxOptions();
   // Beta
   param.beta = options->beta();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Softmax(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Softmax>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadConcatenation(const Operator *op,
-                                                                 ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadConcatenation(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Concat::Param param;
   const auto *options = op->builtin_options_as_ConcatenationOptions();
   // Axis
   param.axis = options->axis();
   // activation unused
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Concat(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Concat>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadFill(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadFC(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Fill(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
+  ir::operation::FullyConnected::Param param;
+  const auto *options = op->builtin_options_as_FullyConnectedOptions();
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadFC(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
+  param.activation = convertActivation(options->fused_activation_function());
+  // weights_format unused
 
-  loadOperationIO(op, inputs, outputs);
+  const auto fc = loadOperationTo<ir::operation::FullyConnected>(op, subg, param);
 
-  const auto &input_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::INPUT));
-  auto &weights_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::WEIGHT));
+  const auto &input_operand =
+      subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::INPUT));
+  auto &weights_operand =
+      subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::WEIGHT));
   if (input_operand.typeInfo().type() == ir::DataType::FLOAT32 &&
       weights_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM)
   {
     weights_operand.type(ir::DataType::QUANT_INT8_SYMM);
   }
-
-  ir::operation::FullyConnected::Param param;
-  const auto *options = op->builtin_options_as_FullyConnectedOptions();
-
-  param.activation = convertActivation(options->fused_activation_function());
-  // weights_format unused
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::FullyConnected(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadAddV2(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::BinaryArithmetic::Param param;
   param.arithmetic_type = ir::operation::BinaryArithmetic::ArithmeticType::ADD;
 
@@ -722,21 +705,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir:
     param.activation = convertActivation(fused_activation_func);
   }
 
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::BinaryArithmetic(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-template <ir::operation::BinaryArithmetic::ArithmeticType op_type>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operator *op,
-                                                                    ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadBinaryArithmetic(
+    const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::BinaryArithmetic::Param param;
   param.arithmetic_type = op_type;
   switch (op_type)
@@ -771,172 +746,66 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operat
       break;
   }
 
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::BinaryArithmetic(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPack(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadPack(const Operator *op, ir::Graph &subg)
 {
-  // This runtime_error will be removed if the one of backend supports this operation
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Pack::Param param;
   const auto *options = op->builtin_options_as_PackOptions();
   param.num = options->values_count();
   param.axis = options->axis();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Pack(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Pack>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseActivation(
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadElementwiseActivation(
     const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type,
     float alpha, float beta)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::ElementwiseActivation::Param param;
   param.op_type = op_type;
   param.alpha = alpha;
   param.beta = beta;
 
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::ElementwiseActivation(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::ElementwiseActivation>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeBilinear(const Operator *op,
-                                                                  ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadResizeBilinear(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-  auto input = inputs.at(0);
-  auto size = inputs.at(1);
-
-  // FIXME Handle ResizeBilinearOptions.
-  if (!subg.operands().at(size).isConstant())
-    throw std::runtime_error("ResizeBilinear: non-constant 'size' is not supported.");
-
-  std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>();
-
   ir::operation::ResizeBilinear::Param param;
-  param.height_out = size_v[0];
-  param.width_out = size_v[1];
   param.align_corners = op->builtin_options_as_ResizeBilinearOptions()->align_corners();
   param.half_pixel_centers = op->builtin_options_as_ResizeBilinearOptions()->half_pixel_centers();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::ResizeBilinear({input}, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::ResizeBilinear>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeNearestNeighbor(const Operator *op,
-                                                                         ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-  auto input = inputs.at(0);
-  auto size = inputs.at(1);
-
-  if (!subg.operands().at(size).isConstant())
-    throw std::runtime_error("ResizeNearestNeighbor: non-constant 'size' is not supported.");
-
-  std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>();
-
   ir::operation::ResizeNearestNeighbor::Param param;
-  param.height_out = size_v[0];
-  param.width_out = size_v[1];
   param.align_corners = op->builtin_options_as_ResizeNearestNeighborOptions()->align_corners();
 
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::ResizeNearestNeighbor({input}, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSelect(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Select(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::ResizeNearestNeighbor>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSquaredDifference(const Operator *op,
-                                                                     ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadReduce(const Operator *op, ir::Graph &subg,
+                                          ir::operation::Reduce::ReduceType reduce_type)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::SquaredDifference(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadTranspose(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-  auto input = inputs.at(0);
-  auto perm = inputs.at(1);
-
-  if (!subg.operands().at(perm).isConstant())
-    throw std::runtime_error("Transpose: non-constant 'perm' is not supported.");
-
-  ir::operation::Transpose::Param param;
-  param.perm = subg.operands().at(perm).template asVector<int>();
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Transpose({input}, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-template <ir::operation::Reduce::ReduceType reduce_type>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadReduce(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Reduce::Param param;
   param.reduce_type = reduce_type;
   param.keep_dims = op->builtin_options_as_ReducerOptions()->keep_dims();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Reduce>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadReduceAll(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Reduce::Param param;
   param.reduce_type = ir::operation::Reduce::ReduceType::ALL;
   if (op->custom_options() == nullptr)
@@ -952,64 +821,28 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op,
     param.keep_dims = attr_map["keep_dims"].AsBool();
   }
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadReverseV2(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Reverse(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPad(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Pad(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Reduce>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseBinary(const Operator *op,
-                                                                     ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadElementwiseBinary(
+    const Operator *op, ir::Graph &subg,
+    ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::ElementwiseBinary::Param param;
   param.op_type = op_type;
 
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::ElementwiseBinary(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::ElementwiseBinary>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary(
-    const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadElementwiseUnary(const Operator *op, ir::Graph &subg,
+                                                    ir::operation::ElementwiseUnary::Type op_type)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::ElementwiseUnary::Param param;
   param.op_type = op_type;
 
+  const auto eu = loadOperationTo<ir::operation::ElementwiseUnary>(op, subg, param);
   if (op_type == ir::operation::ElementwiseUnary::Type::CAST)
   {
     auto qasymm8ToUint8 = [](ir::Operand &operand) {
@@ -1018,61 +851,24 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary(
         operand.type(ir::DataType::UINT8);
       }
     };
-    qasymm8ToUint8(subg.operands().at(inputs.at(ir::operation::ElementwiseUnary::Input::INPUT)));
-    qasymm8ToUint8(subg.operands().at(outputs.at(0)));
+    qasymm8ToUint8(
+        subg.operands().at(eu->getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)));
+    qasymm8ToUint8(subg.operands().at(eu->getOutputs().at(0)));
   }
-
-  std::unique_ptr<ir::Operation> new_op(
-      new ir::operation::ElementwiseUnary(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadExpandDims(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::ExpandDims(inputs, outputs));
-  subg.addOperation(std::move(new_op));
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadGather(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadGather(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
   ir::operation::Gather::Param param;
   param.axis = op->builtin_options_as_GatherOptions()->axis();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Gather(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToBatchND(const Operator *op,
-                                                                  ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::SpaceToBatchND{inputs, outputs}};
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Gather>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadBatchMatMul(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
   ir::operation::BatchMatMul::Param param;
 
   const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
@@ -1105,89 +901,21 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *o
           " as " + EnumNameBuiltinOperator(BuiltinOperator::BuiltinOperator_BATCH_MATMUL));
   }
 
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchMatMul{inputs, outputs, param}};
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchToSpaceND(const Operator *op,
-                                                                  ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchToSpaceND{inputs, outputs}};
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadMatrixBandPart(const Operator *op,
-                                                                  ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::MatrixBandPart(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::BatchMatMul>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSpaceToDepth(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
   ir::operation::SpaceToDepth::Param param;
-
   const auto *options = op->builtin_options_as_SpaceToDepthOptions();
-
   param.block_size = options->block_size();
 
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadStatelessRandomUniform(const Operator *op,
-                                                                          ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::StatelessRandomUniform(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::SpaceToDepth>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadRank(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Rank(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg)
 {
   ir::OperandIndexSequence inputs;
   ir::OperandIndexSequence outputs;
@@ -1237,7 +965,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
         loadReduceAll(op, subg);
         break;
       case BuiltinOP::MatrixBandPart:
-        loadMatrixBandPart(op, subg);
+        loadOperationTo<ir::operation::MatrixBandPart>(op, subg);
         break;
       case BuiltinOP::BatchMatMul:
         loadBatchMatMul(op, subg);
@@ -1246,13 +974,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
         loadEinsum(op, subg);
         break;
       case BuiltinOP::BroadcastTo:
-        loadBroadcastTo(op, subg);
+        loadOperationTo<ir::operation::BroadcastTo>(op, subg);
         break;
       case BuiltinOP::FusedBatchNorm:
         loadFusedBatchNorm(op, subg);
         break;
       case BuiltinOP::StatelessRandomUniform:
-        loadStatelessRandomUniform(op, subg);
+        loadOperationTo<ir::operation::StatelessRandomUniform>(op, subg);
         break;
       case BuiltinOP::Erf:
         loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ERF);
@@ -1285,141 +1013,71 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
   }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSqueeze(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSqueeze(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  ir::operation::Squeeze::Param param{};
+  ir::operation::Squeeze::Param param;
   const auto *options = op->builtin_options_as_SqueezeOptions();
   const auto *dims = options->squeeze_dims();
   if (dims)
   {
-    if (dims->Length() > sizeof(param.dims) / sizeof(param.dims[0]))
+    if (dims->size() > sizeof(param.dims) / sizeof(param.dims[0]))
       throw std::runtime_error("Squeeze: 'param.ndims' is out of range.");
-    param.ndim = dims->Length();
+    param.ndim = dims->size();
     for (int i = 0; i < param.ndim; ++i)
       param.dims[i] = dims->Get(i);
   }
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Squeeze(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Squeeze>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPrelu(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSplit(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::PReLU(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSplit(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-  // Notice : input order is strange for tflite split
-  auto input = inputs.at(1);
-  auto axis = inputs.at(0);
-
-  // FIXME Handle SplitOptions.
-  if (!subg.operands().at(axis).isConstant())
-    throw std::runtime_error("Split: non-constant 'axis' is not supported.");
-
-  ir::operation::Split::Param param{};
-  param.axis = subg.operands().at(axis).template asScalar<int>();
+  ir::operation::Split::Param param;
   const auto *options = op->builtin_options_as_SplitOptions();
   param.num_splits = options->num_splits();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Split({input}, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Split>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSplitV(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSplitV(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  ir::operation::SplitV::Param param{};
-
+  ir::operation::SplitV::Param param;
   const auto *options = op->builtin_options_as_SplitVOptions();
   param.num_splits = options->num_splits();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::SplitV(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadSlice(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::Slice{inputs, outputs}};
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::SplitV>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadStridedSlice(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadStridedSlice(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::StridedSlice::Param param;
-
   const auto *options = op->builtin_options_as_StridedSliceOptions();
   param.begin_mask = options->begin_mask();
   param.end_mask = options->end_mask();
   param.shrink_axis_mask = options->shrink_axis_mask();
 
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::StridedSlice{inputs, outputs, param}};
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::StridedSlice>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadUnpack(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadUnpack(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Unpack::Param param;
   const auto *options = op->builtin_options_as_UnpackOptions();
   param.num = options->num();
   param.axis = options->axis();
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Unpack(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Unpack>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadComparison(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::Comparison::Param param;
-
   const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
 
   switch (builtin_op)
@@ -1447,24 +1105,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op
           std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op)));
   }
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Comparison(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::Comparison>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadEinsum(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
   ir::operation::Einsum::Param param;
-
-  if (inputs.size() != 2)
-  {
-    throw std::runtime_error{"Einsum: NYI input - only support two inputs"};
-  }
-
   if (op->custom_options() == nullptr)
   {
     throw std::runtime_error{"Einsum: empty equation"};
@@ -1478,24 +1125,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir
     param.equation = attr_map["equation"].ToString();
   }
 
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::Einsum{inputs, outputs, param}};
-  subg.addOperation(std::move(new_op));
+  const auto es = loadOperationTo<ir::operation::Einsum>(op, subg, param);
+  if (es->getInputs().size() != 2)
+  {
+    throw std::runtime_error{"Einsum: NYI input - only support two inputs"};
+  }
 }
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator *op,
-                                                                  ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadFusedBatchNorm(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
   ir::operation::FusedBatchNorm::Param param;
-
-  if (inputs.size() != 5)
-  {
-    throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"};
-  }
-
   if (op->custom_options() == nullptr)
   {
     throw std::runtime_error{"FusedBatchNorm: empty option"};
@@ -1511,195 +1150,104 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator
     param.data_format = attr_map["data_format"].ToString();
   }
 
-  std::unique_ptr<ir::Operation> new_op{new ir::operation::FusedBatchNorm{inputs, outputs, param}};
-  subg.addOperation(std::move(new_op));
+  const auto fbn = loadOperationTo<ir::operation::FusedBatchNorm>(op, subg, param);
+
+  if (fbn->getInputs().size() != 5)
+  {
+    throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"};
+  }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadOneHot(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadOneHot(const Operator *op, ir::Graph &subg)
 {
   if (op->inputs()->size() != 4 || op->outputs()->size() != 1)
     throw std::runtime_error("OneHot Op has wrong number of input or output tensors.");
 
-  // Set input and output tensors
-  ir::OperandIndexSequence inputs, outputs;
-  loadOperationIO(op, inputs, outputs);
-
   // Set parameter
-  const auto axis = op->builtin_options_as_OneHotOptions()->axis();
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::OneHot(inputs, outputs, {axis}));
-  subg.addOperation(std::move(new_op));
-}
+  ir::operation::OneHot::Param param;
+  param.axis = op->builtin_options_as_OneHotOptions()->axis();
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadShape(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  // ir::operation::Shape::Param param;
-  // const auto *options = op->builtin_options_as_ShapeOptions();
-  // param.out_type = tensorTypeToDataType(options->out_type());
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Shape(inputs, outputs /*, param*/));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::OneHot>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadIf(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadIf(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
+  const auto *options = op->builtin_options_as_IfOptions();
+  const int32_t then_index = options->then_subgraph_index();
+  const int32_t else_index = options->else_subgraph_index();
 
-  loadOperationIO(op, inputs, outputs);
+  verifySubgraphIndex(then_index);
+  verifySubgraphIndex(else_index);
 
   ir::operation::If::Param param;
-  const auto *options = op->builtin_options_as_IfOptions();
-  const uint32_t then_index = options->then_subgraph_index();
-  const uint32_t else_index = options->else_subgraph_index();
-  param.then_subg_index = ir::SubgraphIndex{then_index};
-  param.else_subg_index = ir::SubgraphIndex{else_index};
+  param.then_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(then_index)};
+  param.else_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(else_index)};
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::If(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::If>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadWhile(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadWhile(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
+  const auto *options = op->builtin_options_as_WhileOptions();
+  const int32_t cond_index = options->cond_subgraph_index();
+  const int32_t body_index = options->body_subgraph_index();
 
-  loadOperationIO(op, inputs, outputs);
+  verifySubgraphIndex(cond_index);
+  verifySubgraphIndex(body_index);
 
   ir::operation::While::Param param;
-  const auto *options = op->builtin_options_as_WhileOptions();
-  const uint32_t cond_index = options->cond_subgraph_index();
-  const uint32_t body_index = options->body_subgraph_index();
-  param.cond_subg_index = ir::SubgraphIndex{cond_index};
-  param.body_subg_index = ir::SubgraphIndex{body_index};
+  param.cond_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(cond_index)};
+  param.body_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(body_index)};
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::While(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::While>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadArgMax(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadArgMax(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  auto inputOperand = subg.operands().at(inputs.at(0));
-  auto axisOperand = subg.operands().at(inputs.at(1));
-
-  if (!axisOperand.isConstant())
-    throw std::runtime_error("ArgMax: non-constant 'axis' is not supported.");
-  if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 ||
-                                           axisOperand.typeInfo().type() == ir::DataType::INT64)))
-    throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported.");
-
   ir::operation::ArgMax::Param param;
-  param.axis = axisOperand.template asVector<int>()[0];
   const auto output_type = op->builtin_options_as_ArgMaxOptions()->output_type();
   switch (output_type)
   {
     case TensorType::TensorType_INT32:
     case TensorType::TensorType_INT64:
+      param.output_type = tensorTypeToDataType(output_type);
       break;
     default:
       throw std::runtime_error("ArgMax: `output_type` must be either int32 or int64.");
   }
-  param.output_type = tensorTypeToDataType(output_type);
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::ArgMax(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
+  auto am = loadOperationTo<ir::operation::ArgMax>(op, subg, param);
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadPow(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Pow(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadRange(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Range(inputs, outputs));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadTile(const Operator *op, ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  auto multiples = inputs.at(ir::operation::Tile::MULTIPLES);
-
-  if (!subg.operands().at(multiples).isConstant())
-    throw std::runtime_error("Tile: non-constant 'multiples' is not supported.");
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::Tile(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  auto &axisOperand = subg.operands().at(am->getInputs().at(ir::operation::ArgMax::Input::AXIS));
+  if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 ||
+                                           axisOperand.typeInfo().type() == ir::DataType::INT64)))
+    throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported.");
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadLogSoftmax(const Operator *op, ir::Graph &subg)
 {
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
   ir::operation::LogSoftmax::Param param;
-
   // In tflite, beta is fixed to 1.0 and axis is fixed to -1.
   param.beta = 1.0f;
   param.axis = -1;
 
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::LogSoftmax(inputs, outputs, param));
-  subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadL2Normalization(const Operator *op,
-                                                                   ir::Graph &subg)
-{
-  ir::OperandIndexSequence inputs;
-  ir::OperandIndexSequence outputs;
-
-  loadOperationIO(op, inputs, outputs);
-
-  std::unique_ptr<ir::Operation> new_op(new ir::operation::L2Normalization(inputs, outputs));
-  subg.addOperation(std::move(new_op));
+  loadOperationTo<ir::operation::LogSoftmax>(op, subg, param);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadLeakyRelu(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadLeakyRelu(const Operator *op, ir::Graph &subg)
 {
   float alpha = op->builtin_options_as_LeakyReluOptions()->alpha();
   loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LEAKY_RELU, alpha,
                             1.f);
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg)
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadOperation(const Operator *op, ir::Graph &subg)
 {
   const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
 
@@ -1733,16 +1281,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadFC(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_ADD:
-      loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::ADD>(op, subg);
+      loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::ADD);
       return;
     case BuiltinOperator::BuiltinOperator_SUB:
-      loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::SUB>(op, subg);
+      loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::SUB);
       return;
     case BuiltinOperator::BuiltinOperator_MUL:
-      loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::MUL>(op, subg);
+      loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::MUL);
       return;
     case BuiltinOperator::BuiltinOperator_DIV:
-      loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::DIV>(op, subg);
+      loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::DIV);
       return;
     case BuiltinOperator::BuiltinOperator_PACK:
       loadPack(op, subg);
@@ -1769,40 +1317,37 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::RSQRT);
       return;
     case BuiltinOperator::BuiltinOperator_SELECT:
-      loadSelect(op, subg);
-      return;
     case BuiltinOperator::BuiltinOperator_SELECT_V2:
-      // Use same loader with BuiltinOperator_SELECT
-      loadSelect(op, subg);
+      loadOperationTo<ir::operation::Select>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_SQRT:
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SQRT);
       return;
     case BuiltinOperator::BuiltinOperator_SQUARED_DIFFERENCE:
-      loadSquaredDifference(op, subg);
+      loadOperationTo<ir::operation::SquaredDifference>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_TANH:
       loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::TANH, 1.f,
                                 1.f);
       return;
     case BuiltinOperator::BuiltinOperator_TRANSPOSE:
-      loadTranspose(op, subg);
+      loadOperationTo<ir::operation::Transpose>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_MEAN:
-      loadReduce<ir::operation::Reduce::ReduceType::MEAN>(op, subg);
+      loadReduce(op, subg, ir::operation::Reduce::ReduceType::MEAN);
       return;
     case BuiltinOperator::BuiltinOperator_REDUCE_ANY:
-      loadReduce<ir::operation::Reduce::ReduceType::ANY>(op, subg);
+      loadReduce(op, subg, ir::operation::Reduce::ReduceType::ANY);
       return;
     case BuiltinOperator::BuiltinOperator_REDUCE_MAX:
-      loadReduce<ir::operation::Reduce::ReduceType::MAX>(op, subg);
+      loadReduce(op, subg, ir::operation::Reduce::ReduceType::MAX);
       return;
     case BuiltinOperator::BuiltinOperator_REVERSE_V2:
-      loadReverseV2(op, subg);
+      loadOperationTo<ir::operation::Reverse>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_PAD:
     case BuiltinOperator::BuiltinOperator_PADV2:
-      loadPad(op, subg);
+      loadOperationTo<ir::operation::Pad>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_LOGISTIC:
       loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LOGISTIC);
@@ -1811,19 +1356,19 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::EXP);
       return;
     case BuiltinOperator::BuiltinOperator_EXPAND_DIMS:
-      loadExpandDims(op, subg);
+      loadOperationTo<ir::operation::ExpandDims>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_GATHER:
       loadGather(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_SPACE_TO_BATCH_ND:
-      loadSpaceToBatchND(op, subg);
+      loadOperationTo<ir::operation::SpaceToBatchND>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_BATCH_TO_SPACE_ND:
-      loadBatchToSpaceND(op, subg);
+      loadOperationTo<ir::operation::BatchToSpaceND>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_SUM:
-      loadReduce<ir::operation::Reduce::ReduceType::SUM>(op, subg);
+      loadReduce(op, subg, ir::operation::Reduce::ReduceType::SUM);
       return;
     case BuiltinOperator::BuiltinOperator_CUSTOM:
       loadCustom(op, subg);
@@ -1832,7 +1377,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadSqueeze(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_PRELU:
-      loadPrelu(op, subg);
+      loadOperationTo<ir::operation::PReLU>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_SPLIT:
       loadSplit(op, subg);
@@ -1841,7 +1386,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadSplitV(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_SLICE:
-      loadSlice(op, subg);
+      loadOperationTo<ir::operation::Slice>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_STRIDED_SLICE:
       loadStridedSlice(op, subg);
@@ -1850,10 +1395,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadUnpack(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_MINIMUM:
-      loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN>(op, subg);
+      loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN);
       return;
     case BuiltinOperator::BuiltinOperator_MAXIMUM:
-      loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX>(op, subg);
+      loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX);
       return;
     case BuiltinOperator::BuiltinOperator_CAST:
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::CAST);
@@ -1879,10 +1424,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SIN);
       return;
     case BuiltinOperator::BuiltinOperator_SHAPE:
-      loadShape(op, subg);
+      loadOperationTo<ir::operation::Shape>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_REDUCE_PROD:
-      loadReduce<ir::operation::Reduce::ReduceType::PROD>(op, subg);
+      loadReduce(op, subg, ir::operation::Reduce::ReduceType::PROD);
       return;
     case BuiltinOperator::BuiltinOperator_IF:
       loadIf(op, subg);
@@ -1903,26 +1448,26 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ROUND);
       return;
     case BuiltinOperator::BuiltinOperator_POW:
-      loadPow(op, subg);
+      loadOperationTo<ir::operation::Pow>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_LOGICAL_NOT:
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::LOGICAL_NOT);
       return;
     case BuiltinOperator::BuiltinOperator_LOGICAL_OR:
-      loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR>(
-          op, subg);
+      loadElementwiseBinary(op, subg,
+                            ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR);
       return;
     case BuiltinOperator::BuiltinOperator_FILL:
-      loadFill(op, subg);
+      loadOperationTo<ir::operation::Fill>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_ZEROS_LIKE:
       loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ZEROS_LIKE);
       return;
     case BuiltinOperator::BuiltinOperator_TILE:
-      loadTile(op, subg);
+      loadOperationTo<ir::operation::Tile>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_RANGE:
-      loadRange(op, subg);
+      loadOperationTo<ir::operation::Range>(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_BATCH_MATMUL:
       loadBatchMatMul(op, subg);
@@ -1937,13 +1482,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
       loadSpaceToDepth(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_L2_NORMALIZATION:
-      loadL2Normalization(op, subg);
+      loadOperationTo<ir::operation::L2Normalization>(op, subg);
       break;
     case BuiltinOperator::BuiltinOperator_LEAKY_RELU:
       loadLeakyRelu(op, subg);
       return;
     case BuiltinOperator::BuiltinOperator_RANK:
-      loadRank(op, subg);
+      loadOperationTo<ir::operation::Rank>(op, subg);
       return;
     default:
       throw std::runtime_error(
@@ -1951,8 +1496,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
   }
 }
 
-template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadModel()
+template <typename LoaderDomain> void BaseLoader<LoaderDomain>::loadModel()
 {
   LoaderDomain::VerifyModelBuffer(*_verifier.get());
   _model = LoaderDomain::GetModel(_base);
@@ -1967,8 +1511,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadModel()
   auto subgraphs = std::make_unique<ir::Subgraphs>();
   for (uint32_t subgraph_index = 0; subgraph_index < domain_subgraphs->size(); ++subgraph_index)
   {
-    auto subg =
-        static_cast<SpecificLoader *>(this)->loadSubgraph((*_model->subgraphs())[subgraph_index]);
+    auto subg = loadSubgraph((*_model->subgraphs())[subgraph_index]);
     subgraphs->push(ir::SubgraphIndex{subgraph_index}, std::move(subg));
   }
   _subgraphs = std::move(subgraphs);
diff --git a/runtime/onert/frontend/circle/CMakeLists.txt b/runtime/onert/frontend/circle/CMakeLists.txt
index 8bcf85dd3..76dca9989 100644
--- a/runtime/onert/frontend/circle/CMakeLists.txt
+++ b/runtime/onert/frontend/circle/CMakeLists.txt
@@ -8,7 +8,7 @@ add_library(circle_loader SHARED ${CIRCLE_LOADER_SOURCES})
 
 target_include_directories(circle_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
-target_link_libraries(circle_loader PUBLIC onert_core)
+target_link_libraries(circle_loader PRIVATE onert_core)
 target_link_libraries(circle_loader PRIVATE base_loader nnfw_common nnfw_coverage)
 target_link_libraries(circle_loader PRIVATE circle_schema)
 
diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc
index 92a9ee7a5..4565ffc00 100644
--- a/runtime/onert/frontend/circle/src/circle_loader.cc
+++ b/runtime/onert/frontend/circle/src/circle_loader.cc
@@ -69,7 +69,7 @@ struct LoaderDomain
   static bool VerifyModelBuffer(Verifier &verifier) { return circle::VerifyModelBuffer(verifier); }
 };
 
-class CircleLoader final : public base_loader::BaseLoader<LoaderDomain, CircleLoader>
+class CircleLoader final : public base_loader::BaseLoader<LoaderDomain>
 {
 protected:
   void loadInstanceNorm(const Operator *op, ir::Graph &subg);
@@ -91,7 +91,8 @@ public:
     }
   }
 
-  std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg)
+private:
+  std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) override
   {
     auto subg = std::make_unique<ir::Graph>();
     // Load tensors
diff --git a/runtime/onert/frontend/nnapi/execution.cc b/runtime/onert/frontend/nnapi/execution.cc
index ce7da579e..56ca5ef00 100644
--- a/runtime/onert/frontend/nnapi/execution.cc
+++ b/runtime/onert/frontend/nnapi/execution.cc
@@ -94,12 +94,36 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution *execution, int32
 
   // Omitted optional input
   // LSTM operation's some inputs can be optional input
+  // Transpose operation's permutation input can be optional input
   if ((buffer == nullptr) && (length == 0))
   {
+    uint32_t dims[1] = {0};
+    ANeuralNetworksOperandType compared_shape;
+    compared_shape.dimensionCount = 1;
+    compared_shape.dimensions = dims;
     if (execution->hasUnspecifiedDims(operand_index))
     {
       return ANEURALNETWORKS_NO_ERROR;
     }
+    else if (type == nullptr && execution->IsOptionalInput(operand_index))
+    {
+      if (!execution->setOptionalInput(index, type, buffer, length))
+      {
+        VERBOSE(NNAPI::Execution) << "setInput: Fail to set optional input" << std::endl;
+        return ANEURALNETWORKS_BAD_DATA;
+      }
+      return ANEURALNETWORKS_NO_ERROR;
+    }
+    // TODO Changes the condition to check zero sized
+    else if (execution->compareShape(&compared_shape, operand_index))
+    {
+      if (!execution->setInput(index, type, buffer, length))
+      {
+        VERBOSE(NNAPI::Execution) << "setInput: Fail to set input" << std::endl;
+        return ANEURALNETWORKS_BAD_DATA;
+      }
+      return ANEURALNETWORKS_NO_ERROR;
+    }
     else
     {
       VERBOSE(NNAPI::Execution) << "setInput: Cannot handle fully-specified shape on model build "
diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc
index eb12d7e76..6114b74b0 100644
--- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc
@@ -98,6 +98,17 @@ bool ANeuralNetworksExecution::compareShape(const ANeuralNetworksOperandType *ty
   return operand_shape == shape_from_type;
 }
 
+bool ANeuralNetworksExecution::IsOptionalInput(const onert::ir::OperandIndex index) noexcept
+{
+  const auto &operand_shape = _execution->primary_subgraph().operands().at(index).shape();
+  for (int32_t i = 0; i < operand_shape.rank(); ++i)
+  {
+    if (operand_shape.dim(i) != 0)
+      return false;
+  }
+  return true;
+}
+
 bool ANeuralNetworksExecution::hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept
 {
   const auto operand_shape = _execution->primary_subgraph().operands().at(index).shape();
@@ -148,6 +159,45 @@ bool ANeuralNetworksExecution::setInput(uint32_t index, const ANeuralNetworksOpe
   return true;
 }
 
+bool ANeuralNetworksExecution::setOptionalInput(uint32_t index,
+                                                const ANeuralNetworksOperandType *type,
+                                                const void *buffer, size_t length) noexcept
+{
+  assert(type == nullptr);
+  assert(buffer == nullptr);
+  assert(length == 0);
+  try
+  {
+    onert::ir::IOIndex input_index{index};
+    const auto operand_index = getInputOperandIndex(index);
+
+    const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo();
+    const auto shape = (type != nullptr)
+                           ? NNAPIConvert::getShape(type)
+                           : _execution->primary_subgraph().operands().at(operand_index).shape();
+
+    // ANeuralNetworksExecution::setInput() uses only shape information
+    ANeuralNetworksOperandType optional_input_type;
+    optional_input_type.dimensionCount = shape.rank();
+    std::vector<uint32_t> dims(optional_input_type.dimensionCount);
+    for (uint32_t i = 0; i < optional_input_type.dimensionCount; ++i)
+    {
+      dims.at(i) = shape.dim(i);
+    }
+    optional_input_type.dimensions = dims.data();
+
+    return setInput(index, &optional_input_type, buffer, length);
+  }
+  catch (const std::exception &e)
+  {
+    VERBOSE(EXCEPTION) << e.what() << std::endl;
+
+    return false;
+  }
+
+  return true;
+}
+
 bool ANeuralNetworksExecution::setOutput(uint32_t index, const ANeuralNetworksOperandType *type,
                                          void *buffer, size_t length) noexcept
 {
diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h
index 848ae743f..1f4b868f6 100644
--- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h
+++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h
@@ -35,6 +35,8 @@ public:
 public:
   bool setInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer,
                 size_t length) noexcept;
+  bool setOptionalInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer,
+                        size_t length) noexcept;
   bool setOutput(uint32_t index, const ANeuralNetworksOperandType *type, void *buffer,
                  size_t length) noexcept;
   bool startExecute(void) noexcept;
@@ -46,6 +48,7 @@ public:
                        const onert::ir::OperandIndex index) noexcept;
   bool compareShape(const ANeuralNetworksOperandType *type,
                     const onert::ir::OperandIndex index) noexcept;
+  bool IsOptionalInput(const onert::ir::OperandIndex index) noexcept;
   bool hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept;
   size_t getOperandSize(const onert::ir::OperandIndex index) noexcept;
   const std::shared_ptr<onert::exec::Execution> instance(void) noexcept;
diff --git a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc
index 15a279a7e..bb42f2b08 100644
--- a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc
@@ -16,10 +16,10 @@
 
 #include <gtest/gtest.h>
 
-#include "wrapper/ANeuralNetworksModel.h"
+#include "ANeuralNetworksModel.h"
 
-TEST(MODEL, model_build)
+TEST(MODEL, neg_model_build)
 {
   ANeuralNetworksModel model;
-  ASSERT_EQ(model.isFinished(), false);
+  ASSERT_FALSE(model.isFinished());
 }
diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
index 8e3d83db4..e6c38f5f8 100644
--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
@@ -708,31 +708,7 @@ OperationFactory::OperationFactory()
     return new operation::StridedSlice{inputs, outputs, param};
   };
 
-  _map[ANEURALNETWORKS_TRANSPOSE] = [](const OperationFactory::Param &init_param,
-                                       Operands &operands) {
-    // TODO make this work with init_param.input_count == 1 (when permutation vector is optional)
-
-    // Inputs
-    // 0: An n-D tensor, specifying the tensor to be transposed.
-    // 1: An optional 1-D Tensor of {@link ANEURALNETWORKS_TENSOR_INT32},
-    //    the permutation of the dimensions of the input tensor.
-    //    The returned tensor's dimension i corresponds to the input dimension
-    //    perm[i]. If perm is not given, it is set to (n-1...0), where n is the
-    //    rank of the input tensor. Hence by default, this operation performs a
-    //    regular matrix transpose on 2-D input Tensors.
-    assert(init_param.input_count == 2);
-    assert(init_param.output_count == 1);
-
-    OperandIndexSequence inputs{init_param.inputs[0]};
-    OperandIndexSequence outputs{init_param.outputs[0]};
-    std::vector<std::int32_t> perm =
-        operands.at(OperandIndex{init_param.inputs[1]}).asVector<std::int32_t>();
-
-    operation::Transpose::Param param;
-    param.perm.assign(perm.cbegin(), perm.cend());
-
-    return new operation::Transpose{inputs, outputs, param};
-  };
+  _map[ANEURALNETWORKS_TRANSPOSE] = createSimpleBinaryOp<operation::Transpose>;
 
   _map[ANEURALNETWORKS_MUL] =
       getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::MUL);
@@ -982,6 +958,28 @@ OperationFactory::OperationFactory()
     return new operation::ResizeBilinear{inputs, outputs, param};
   };
 
+  _map[ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR] = [](const OperationFactory::Param &init_param,
+                                                     Operands &operands) {
+    assert((init_param.input_count == 3 || init_param.input_count == 4) &&
+           init_param.output_count == 1);
+
+    OperandIndexSequence outputs{init_param.outputs[0]};
+
+    // Each input should be interpreted as follows:
+    //
+    //  0 -> IFM Index
+    //  1 -> Height Index
+    //  2 -> Width Index
+    OperandIndexSequence inputs{init_param.inputs[0]};
+
+    operation::ResizeNearestNeighbor::Param param;
+    param.height_out = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<int32_t>();
+    param.width_out = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<int32_t>();
+    param.align_corners = false;
+    // The layout input is not supported yet
+    return new operation::ResizeNearestNeighbor{inputs, outputs, param};
+  };
+
   _map[ANEURALNETWORKS_RELU1] = getElementwiseActivationGenerator(
       onert::ir::operation::ElementwiseActivation::Type::RELU, 1.f, -1.f);
 
@@ -1304,6 +1302,105 @@ OperationFactory::OperationFactory()
     }
     param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>();
     param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>();
+    // This is initialization to prevent warning or error by static code analyzer. LSTM operation
+    // does not need time_major
+    param.time_major = false;
+
+    return new operation::LSTM{inputs, outputs, param};
+  };
+
+  _map[ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM] = [](const OperationFactory::Param &init_param,
+                                                          Operands &operands) {
+    assert((init_param.input_count >= 24 || init_param.input_count <= 28) &&
+           (init_param.output_count >= 1 && init_param.output_count <= 3));
+
+    // Each input should be interpreted as follows:
+    //
+    // 0 -> Input Tensor Index
+    // 1 -> Input to Input Tensor Index
+    // 2 -> Input to Forget Tensor Index
+    // 3 -> Input to Cell Tensor Index
+    // 4 -> Input to Output Tensor Index
+    // 5 -> Recurrent to Input Weights Tensor Index
+    // 6 -> Recurrent to Forget Weights Tensor Index
+    // 7 -> Recurrent to Cell Weights Tensor Index
+    // 8 -> Recurrent to Output Weights Tensor Index
+    // 9 -> Cell to Input Weights Tensor Index
+    // 10 -> Cell to Forget Weights Tensor Index
+    // 11 -> Cell to Output Weights Tensor Index
+    // 12 -> Input Gate Bias Tensor Index
+    // 13 -> Forget Gate Bias Tensor Index
+    // 14 -> Cell Bias Tensor Index
+    // 15 -> Output Gate Bias Tensor Index
+    // 16 -> Projection Weights Tensor Index
+    // 17 -> Projection Bias Tensor Index
+    // 18 -> Output State In Tensor Index
+    // 19 -> Cell State In Tensor Index
+    assert(init_param.input_count - 3 > 20);
+    OperandIndexSequence inputs;
+    for (uint32_t n = 0; n < 20; ++n)
+    {
+      inputs.append(OperandIndex{init_param.inputs[n]});
+    }
+
+    // 24 -> Input Layer Normalization Weights Tensor Index
+    // 25 -> Forget Layer Normalization Weights Tensor Index
+    // 26 -> Cell Layer Normalization Weights Tensor Index
+    // 27 -> Output Layer Normalization Weights Tensor Index
+    if (init_param.input_count > 24)
+    {
+      for (uint32_t n = 24; n < 28; ++n)
+      {
+        if (init_param.input_count > n)
+        {
+          inputs.append(OperandIndex{init_param.inputs[n]});
+        }
+      }
+    }
+
+    // Each output should be interpreted as follows:
+    //
+    // 0 -> Output Tensor Index -> 3
+    // 1 -> Output State Out Tensor Index
+    // 2 -> Cell State Out Tensor Index
+    const OperandIndex scratch_buffer_index;
+    OperandIndex output_state_index =
+        init_param.output_count >= 2 ? OperandIndex{init_param.outputs[1]} : OperandIndex();
+    OperandIndex cell_state_index =
+        init_param.output_count >= 3 ? OperandIndex{init_param.outputs[2]} : OperandIndex();
+    const OperandIndex output_index = OperandIndex{init_param.outputs[0]};
+    OperandIndexSequence outputs{scratch_buffer_index, output_state_index, cell_state_index,
+                                 output_index};
+
+    operation::LSTM::Param param;
+    const auto activation_index = OperandIndex{init_param.inputs[20]};
+    switch (operands.at(activation_index).asScalar<int32_t>())
+    {
+      case 0:
+        param.activation = Activation::NONE;
+        break;
+      case 1:
+        param.activation = Activation::RELU;
+        break;
+      case 2:
+        param.activation = Activation::RELU1;
+        break;
+      case 3:
+        param.activation = Activation::RELU6;
+        break;
+      case 4:
+        param.activation = Activation::TANH;
+        break;
+      case 6:
+        param.activation = Activation::SIGMOID;
+        break;
+      default:
+        throw std::runtime_error("Unsupported activation type");
+        break;
+    }
+    param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>();
+    param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>();
+    param.time_major = operands.at(OperandIndex{init_param.inputs[23]}).asScalar<bool>();
 
     return new operation::LSTM{inputs, outputs, param};
   };
@@ -1406,7 +1503,7 @@ OperationFactory::OperationFactory()
   // TODO Remove ANEURALNETWORKS_ABS_EX
   _map[ANEURALNETWORKS_ABS_EX] = _map[ANEURALNETWORKS_ABS];
 
-  _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &operands) {
+  _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &) {
     assert(init_param.input_count == 2 && init_param.output_count == 1);
 
     OperandIndexSequence outputs{init_param.outputs[0]};
@@ -1415,10 +1512,9 @@ OperationFactory::OperationFactory()
     //
     //  0 -> Input Tensor Index
     //  1 -> Axis Tensor Index
-    OperandIndexSequence inputs{init_param.inputs[0]};
+    OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
 
     operation::ArgMax::Param param;
-    param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>();
     // NNAPI ARGMAX output type is always int32
     param.output_type = DataType::INT32;
 
@@ -1517,7 +1613,7 @@ OperationFactory::OperationFactory()
     assert(init_param.input_count == 3);
     assert(init_param.output_count >= 1); // At least one output tensor and axis
 
-    OperandIndexSequence inputs{init_param.inputs[0]};
+    OperandIndexSequence inputs{init_param.inputs[1], init_param.inputs[0]};
     OperandIndexSequence outputs;
     for (uint32_t n = 0; n < init_param.output_count; ++n)
     {
@@ -1525,7 +1621,6 @@ OperationFactory::OperationFactory()
     }
 
     operation::Split::Param param;
-    param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>();
     param.num_splits = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<std::int32_t>();
 
     return new operation::Split{inputs, outputs, param};
diff --git a/runtime/onert/frontend/tflite/CMakeLists.txt b/runtime/onert/frontend/tflite/CMakeLists.txt
index fcadf5223..604a9e4cb 100644
--- a/runtime/onert/frontend/tflite/CMakeLists.txt
+++ b/runtime/onert/frontend/tflite/CMakeLists.txt
@@ -8,7 +8,7 @@ add_library(tflite_loader SHARED ${TFLITE_LOADER_SOURCES})
 
 target_include_directories(tflite_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
-target_link_libraries(tflite_loader PUBLIC onert_core)
+target_link_libraries(tflite_loader PRIVATE onert_core)
 target_link_libraries(tflite_loader PRIVATE base_loader nnfw_common nnfw_coverage)
 
 install(TARGETS tflite_loader DESTINATION lib)
diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc
index 7eef15717..fe4295ada 100644
--- a/runtime/onert/frontend/tflite/src/tflite_loader.cc
+++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc
@@ -62,7 +62,7 @@ struct LoaderDomain
   }
 };
 
-class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain, TFLiteLoader>
+class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain>
 {
 public:
   using BaseLoader::BaseLoader;
@@ -78,7 +78,8 @@ public:
     }
   }
 
-  std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg)
+private:
+  std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg) override
   {
     auto subg = std::make_unique<ir::Graph>();
     // Load tensors
diff --git a/runtime/onert/test/graph/Index.cc b/runtime/onert/test/graph/Index.cc
index 358e64c82..2d110e326 100644
--- a/runtime/onert/test/graph/Index.cc
+++ b/runtime/onert/test/graph/Index.cc
@@ -20,7 +20,7 @@
 
 using Index = ::onert::util::Index<uint32_t, struct TestTag>;
 
-TEST(Index, index_test)
+TEST(Index, neg_index_test)
 {
   Index idx1{1u};
   Index idx2{2u};
diff --git a/runtime/onert/test/graph/operand/IndexSet.cc b/runtime/onert/test/graph/operand/IndexSet.cc
index 6215e0d24..6ef425a2d 100644
--- a/runtime/onert/test/graph/operand/IndexSet.cc
+++ b/runtime/onert/test/graph/operand/IndexSet.cc
@@ -21,7 +21,7 @@
 using onert::ir::OperandIndex;
 using onert::ir::OperandIndexSequence;
 
-TEST(graph_OperandIndexSequence, append)
+TEST(graph_OperandIndexSequence, neg_append)
 {
   OperandIndexSequence iset{0, 2, 4, 8};
 
@@ -42,7 +42,7 @@ TEST(graph_OperandIndexSequence, append)
   ASSERT_FALSE(iset.contains(OperandIndex{11}));
 }
 
-TEST(graph_OperandIndexSequence, replace)
+TEST(graph_OperandIndexSequence, neg_replace)
 {
   OperandIndexSequence iset{0, 1, 2, 3};
 
diff --git a/runtime/onert/test/graph/operand/LayoutSet.cc b/runtime/onert/test/graph/operand/LayoutSet.cc
index e35bddd8b..ef965a41e 100644
--- a/runtime/onert/test/graph/operand/LayoutSet.cc
+++ b/runtime/onert/test/graph/operand/LayoutSet.cc
@@ -21,7 +21,22 @@
 using onert::ir::Layout;
 using onert::ir::LayoutSet;
 
-TEST(graph_operand_LayoutSet, layout_set_operators)
+TEST(graph_operand_LayoutSet, neg_add_remove)
+{
+  LayoutSet set{Layout::NCHW};
+  set.remove(Layout::NHWC);
+  ASSERT_EQ(set.size(), 1);
+  set.add(Layout::NHWC);
+  ASSERT_EQ(set.size(), 2);
+  set.remove(Layout::NHWC);
+  ASSERT_EQ(set.size(), 1);
+  set.remove(Layout::NCHW);
+  ASSERT_EQ(set.size(), 0);
+  set.remove(Layout::NCHW);
+  ASSERT_EQ(set.size(), 0);
+}
+
+TEST(graph_operand_LayoutSet, set_operators)
 {
   LayoutSet set1{Layout::NCHW};
   LayoutSet set2{Layout::NHWC};
diff --git a/runtime/onert/test/graph/operand/Set.cc b/runtime/onert/test/graph/operand/Set.cc
index 0d35b5581..ffee417b8 100644
--- a/runtime/onert/test/graph/operand/Set.cc
+++ b/runtime/onert/test/graph/operand/Set.cc
@@ -18,7 +18,7 @@
 
 #include "ir/Operands.h"
 
-TEST(graph_operand_Set, set_test)
+TEST(graph_operand_Set, neg_set_test)
 {
   onert::ir::Operands set;
 
diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/graph/operand/UseDef.cc
index cd2cdb739..a8686eb18 100644
--- a/runtime/onert/test/graph/operand/UseDef.cc
+++ b/runtime/onert/test/graph/operand/UseDef.cc
@@ -31,7 +31,7 @@ using Mock = onert_test::ir::SimpleMock;
 
 } // namespace
 
-TEST(graph_operand_usedef, usedef_test)
+TEST(graph_operand_usedef, neg_usedef_test)
 {
   onert::ir::Graph graph;
   onert::ir::verifier::DAGChecker verifier;
@@ -62,7 +62,7 @@ TEST(graph_operand_usedef, usedef_test)
 
   graph.finishBuilding();
 
-  ASSERT_EQ(verifier.verify(graph), true);
+  ASSERT_TRUE(verifier.verify(graph));
 
   // Check def
   ASSERT_EQ(graph.operands().at(operand_index1).getDef(), mocknode_index1);
diff --git a/runtime/onert/test/graph/operation/SetIO.cc b/runtime/onert/test/graph/operation/SetIO.cc
index 378c5b4b9..22068ff58 100644
--- a/runtime/onert/test/graph/operation/SetIO.cc
+++ b/runtime/onert/test/graph/operation/SetIO.cc
@@ -62,7 +62,7 @@ TEST(graph_operation_setIO, operation_setIO_conv)
   ASSERT_EQ(conv->getInputs().at(Index{0}).value(), 8);
 }
 
-TEST(graph_operation_setIO, operation_setIO_concat)
+TEST(graph_operation_setIO, neg_operation_setIO_concat)
 {
   onert::ir::Graph graph;
 
diff --git a/runtime/onert/test/graph/verifier/Verifier.cc b/runtime/onert/test/graph/verifier/Verifier.cc
index f8c7557e3..3bce2746c 100644
--- a/runtime/onert/test/graph/verifier/Verifier.cc
+++ b/runtime/onert/test/graph/verifier/Verifier.cc
@@ -45,5 +45,54 @@ TEST(Verifier, dag_checker)
 
   onert::ir::verifier::DAGChecker verifier;
 
-  ASSERT_EQ(verifier.verify(graph), true);
+  ASSERT_TRUE(verifier.verify(graph));
+}
+
+TEST(Verifier, neg_edge_consistency_checker_1)
+{
+  onert::ir::Graph graph;
+
+  onert::ir::Shape shape{3};
+  onert::ir::TypeInfo type{onert::ir::DataType::INT32};
+
+  auto operand1 = graph.addOperand(shape, type);
+  auto operand2 = graph.addOperand(shape, type);
+
+  graph.addInput(operand1);
+  graph.addOutput(operand2);
+
+  auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2});
+  auto op_ind = graph.addOperation(std::move(mock_op));
+
+  graph.finishBuilding();
+
+  graph.operands().at(operand1).removeUse(op_ind); // Manipulate the operand alone
+
+  onert::ir::verifier::EdgeConsistencyChecker verifier;
+  ASSERT_FALSE(verifier.verify(graph));
+}
+
+TEST(Verifier, neg_edge_consistency_checker_2)
+{
+  onert::ir::Graph graph;
+
+  onert::ir::Shape shape{3};
+  onert::ir::TypeInfo type{onert::ir::DataType::INT32};
+
+  auto operand1 = graph.addOperand(shape, type);
+  auto operand2 = graph.addOperand(shape, type);
+
+  graph.addInput(operand1);
+  graph.addOutput(operand2);
+
+  auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2});
+  auto mock_op_ptr = mock_op.get();
+  auto op_ind = graph.addOperation(std::move(mock_op));
+
+  graph.finishBuilding();
+
+  mock_op_ptr->setInputs({operand2}); // Manipulate the operation alone
+
+  onert::ir::verifier::EdgeConsistencyChecker verifier;
+  ASSERT_FALSE(verifier.verify(graph));
 }
diff --git a/runtime/onert/test/util/ShapeInference.cc b/runtime/onert/test/util/ShapeInference.cc
index aab33fab5..a5f0af5ee 100644
--- a/runtime/onert/test/util/ShapeInference.cc
+++ b/runtime/onert/test/util/ShapeInference.cc
@@ -34,7 +34,7 @@ TEST(ShapeInference, Elementwise)
   ASSERT_EQ(infered_out_shape.dim(3), 3);
 }
 
-TEST(ShapeInference, IncorrectElementwise)
+TEST(ShapeInference, neg_Elementwise)
 {
   Shape lhs_shape{1, 299, 299, 3};
   Shape rhs_shape{5, 3};
@@ -123,6 +123,18 @@ TEST(ShapeInference, Pool2DNodeExplicit)
   ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20);
 }
 
+TEST(ShapeInference, neg_Pool2DNode_InvalidStride)
+{
+  Shape in_shape{10, 6, 12, 20};
+  Stride stride{0, 7};
+  Padding padding{PaddingType::SAME};
+
+  operation::Pool2D::Param avg_pool_param{
+      operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE};
+  ASSERT_THROW(onert::shape_inference::inferPoolShape(in_shape, avg_pool_param),
+               std::runtime_error);
+}
+
 TEST(ShapeInference, Conv2D)
 {
   Shape in_shape{10, 6, 12, 20};
@@ -159,6 +171,17 @@ TEST(ShapeInference, Conv2D)
   ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 30);
 }
 
+TEST(ShapeInference, neg_Conv2D_InvalidStride)
+{
+  Shape in_shape{10, 6, 12, 20};
+  Shape ker_shape{30, 3, 6, 20};
+
+  operation::Conv2D::Param param{Stride{0, 0}, Padding{PaddingType::VALID}, Activation::NONE,
+                                 Dilation{1, 1}};
+  ASSERT_THROW(onert::shape_inference::inferConv2DShape(in_shape, ker_shape, param),
+               std::runtime_error);
+}
+
 TEST(ShapeInference, DepthwiseConv2D)
 {
   Shape in_shape{10, 6, 12, 20};
@@ -195,6 +218,17 @@ TEST(ShapeInference, DepthwiseConv2D)
   ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 60);
 }
 
+TEST(ShapeInference, neg_DepthwiseConv2D_InvalidSride)
+{
+  Shape in_shape{10, 6, 12, 20};
+  Shape ker_shape{1, 3, 6, 60};
+
+  operation::DepthwiseConv2D::Param param{Stride{3, 0}, Padding{PaddingType::VALID}, 3,
+                                          Activation::NONE};
+  ASSERT_THROW(onert::shape_inference::inferDepthwiseConv2DShape(in_shape, ker_shape, param),
+               std::runtime_error);
+}
+
 TEST(ShapeInference, Concat)
 {
   {
@@ -328,7 +362,8 @@ TEST(ShapeInference, Transpose)
     // pre-conditions
     ASSERT_EQ(in_shape.rank(), perm.size());
     ASSERT_EQ(expected.rank(), perm.size());
-    auto inferred_out_shape = onert::shape_inference::inferTransposeShape(in_shape, perm);
+    auto inferred_out_shape =
+        onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size());
     // post-conditions
     ASSERT_EQ(inferred_out_shape.rank(), perm.size());
     for (int32_t dim = 0; dim < expected.rank(); dim++)
@@ -369,12 +404,141 @@ TEST(ShapeInference, neg_Transpose)
   {
     std::vector<int> perm = {2, 0, 1, 0};
     // int32_t rank = 3;
-    ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error);
+    ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()),
+                 std::runtime_error);
   }
   // Invalid parameter value
   {
     std::vector<int> perm = {2, 0, 3};
     // int32_t rank = 3;
-    ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error);
+    ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()),
+                 std::runtime_error);
+  }
+}
+
+TEST(ShapeInference, Gather)
+{
+  auto check = [&](Shape &input, Shape &indices, Shape &expected, int32_t axis) {
+    int rank = input.rank();
+    auto actual = onert::shape_inference::inferGatherShape(input, indices, axis, rank);
+
+    ASSERT_EQ(actual.rank(), expected.rank());
+
+    for (int32_t dim = 0; dim < expected.rank(); dim++)
+      ASSERT_EQ(actual.dim(dim), expected.dim(dim));
+  };
+
+  // check for 2-D, 3-D, axis 0
+  {
+    Shape input{3, 4};
+    Shape indices{1, 1, 2};
+    int32_t axis = 0;
+    Shape expected{1, 1, 2, 4};
+    check(input, indices, expected, axis);
+  }
+
+  // check for 2-D, 3-D, axis 1
+  {
+    Shape input{3, 4};
+    Shape indices{1, 2, 1};
+    int32_t axis = 1;
+    Shape expected{3, 1, 2, 1};
+    check(input, indices, expected, axis);
+  }
+
+  // check for 3-D, 2-D, axis 0
+  {
+    Shape input{2, 3, 4};
+    Shape indices{1, 2};
+    int32_t axis = 0;
+    Shape expected{1, 2, 3, 4};
+    check(input, indices, expected, axis);
+  }
+
+  // check for 3-D, 2-D, axis 2
+  {
+    Shape input{2, 3, 4};
+    Shape indices{2, 1};
+    int32_t axis = 2;
+    Shape expected{2, 3, 2, 1};
+    check(input, indices, expected, axis);
+  }
+
+  // check for 4D, axis 0
+  {
+    Shape input{1, 2, 3, 4};
+    Shape indices{2};
+    int32_t axis = 0;
+    Shape expected{2, 2, 3, 4};
+    check(input, indices, expected, axis);
+  }
+}
+
+TEST(ShapeInference, BCQFullyConnected)
+{
+  auto check = [&](Shape &in_shape, Shape &cluster_shape, std::vector<int> cluster,
+                   Shape &expected) {
+    auto actual = onert::shape_inference::inferBCQFullyConnectedShape(in_shape, cluster_shape,
+                                                                      cluster.data());
+    ASSERT_EQ(actual.rank(), expected.rank());
+
+    for (int32_t dim = 0; dim < expected.rank(); dim++)
+      ASSERT_EQ(actual.dim(dim), expected.dim(dim));
+  };
+
+  {
+    Shape in_shape{10, 1};
+    Shape cluster_shape{3, 2};
+    std::vector<int> cluster = {1, 10, 2, 10, 3, 10};
+
+    Shape expected{30, 1};
+    check(in_shape, cluster_shape, cluster, expected);
+  }
+
+  {
+    Shape in_shape{1, 1};
+    Shape cluster_shape{1, 2};
+    std::vector<int> cluster = {3, 50};
+
+    Shape expected{50, 1};
+    check(in_shape, cluster_shape, cluster, expected);
+  }
+}
+
+TEST(ShapeInference, BCQGather)
+{
+  auto check = [&](Shape &indices_shape, Shape &cluster_shape, std::vector<int> cluster,
+                   uint32_t hidden_size, uint32_t axis, int rank, Shape &expected) {
+    operation::BCQGather::Param param{hidden_size, axis};
+    auto actual = onert::shape_inference::inferBCQGatherShape(indices_shape, cluster_shape,
+                                                              cluster.data(), rank, param);
+    ASSERT_EQ(actual.rank(), expected.rank());
+
+    for (int32_t dim = 0; dim < expected.rank(); dim++)
+      ASSERT_EQ(actual.dim(dim), expected.dim(dim));
+  };
+
+  {
+    Shape indices_shape{5, 1};
+    Shape cluster_shape{3, 2};
+    std::vector<int> cluster = {1, 10, 2, 10, 3, 10};
+    uint32_t hidden_size = 10;
+    uint32_t axis = 0;
+    int rank = 2;
+
+    Shape expected{5, 1, 10};
+    check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected);
+  }
+
+  {
+    Shape indices_shape{5, 1};
+    Shape cluster_shape{3, 2};
+    std::vector<int> cluster = {1, 10, 2, 10, 3, 10};
+    uint32_t hidden_size = 10;
+    uint32_t axis = 1;
+    int rank = 2;
+
+    Shape expected{30, 5, 1};
+    check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected);
   }
 }
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
index 984dbfa2a..daebe2ae0 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
@@ -1,6 +1,8 @@
 GeneratedTests.abs_
 GeneratedTests.abs_dynamic_nnfw
 GeneratedTests.add_dynamic_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw_quant8
 GeneratedTests.argmax_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_float_adj_x
@@ -58,6 +60,8 @@ GeneratedTests.einsum_ex_float_matmul_4x4_4_2
 GeneratedTests.equal_dynamic_float_nnfw
 GeneratedTests.exp_
 GeneratedTests.exp_dynamic_nnfw
+GeneratedTests.expand_dims_dynamic_nnfw_1
+GeneratedTests.expand_dims_dynamic_nnfw_2
 GeneratedTests.fill_ex_1D_float
 GeneratedTests.fill_ex_4D_float
 GeneratedTests.fill_ex_dynamic_nnfw
@@ -92,6 +96,7 @@ GeneratedTests.logical_not_1D_nnfw
 GeneratedTests.logical_not_4D_nnfw
 GeneratedTests.logical_not_dynamic_nnfw
 GeneratedTests.logical_or_broadcast
+GeneratedTests.logical_or_dynamic_nnfw
 GeneratedTests.logistic_dynamic_nnfw
 GeneratedTests.lsh_projection
 GeneratedTests.lsh_projection_2
@@ -110,6 +115,7 @@ GeneratedTests.matrix_band_part_ex_4D_float
 GeneratedTests.matrix_band_part_ex_dynamic_nnfw
 GeneratedTests.maximum_dynamic_nnfw
 GeneratedTests.minimum_dynamic_nnfw
+GeneratedTests.minimum_int32
 GeneratedTests.mul_dynamic_nnfw
 GeneratedTests.neg
 GeneratedTests.neg_dynamic_nnfw
@@ -169,6 +175,30 @@ GeneratedTests.reduce_prod_dynamic_2_nnfw
 GeneratedTests.reduce_sum_dynamic_1_nnfw
 GeneratedTests.reduce_sum_dynamic_2_nnfw
 GeneratedTests.reshape_dynamic_nnfw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.reverse_ex_1d
 GeneratedTests.reverse_ex_3d
 GeneratedTests.reverse_ex_dynamic_1D
@@ -210,6 +240,7 @@ GeneratedTests.slice_zero_sized_quant8
 GeneratedTests.softmax_dynamic_nnfw
 GeneratedTests.space_to_batch_dynamic_float_nnfw
 GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_float_5_axis_as_input_nnfw
 GeneratedTests.split_v_ex_1D_float_1_nnfw
 GeneratedTests.split_v_ex_1D_float_2_nnfw
 GeneratedTests.split_v_ex_1D_int32_nnfw
@@ -252,8 +283,7 @@ GeneratedTests.tile_3_float16
 GeneratedTests.tile_3_int32
 GeneratedTests.tile_3_quant8
 GeneratedTests.transpose_dynamic_nnfw
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
+GeneratedTests.transpose_float_1_perms_as_input_nnfw
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
 GeneratedTests.unpack_ex_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
index a7bedf14b..3de2c6835 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
@@ -1,6 +1,8 @@
 GeneratedTests.abs_
 GeneratedTests.abs_dynamic_nnfw
 GeneratedTests.add_dynamic_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw_quant8
 GeneratedTests.argmax_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_float_adj_x
@@ -59,6 +61,8 @@ GeneratedTests.equal_dynamic_float_nnfw
 GeneratedTests.exp_
 GeneratedTests.exp_2D_float_nnfw
 GeneratedTests.exp_dynamic_nnfw
+GeneratedTests.expand_dims_dynamic_nnfw_1
+GeneratedTests.expand_dims_dynamic_nnfw_2
 GeneratedTests.fill_ex_1D_float
 GeneratedTests.fill_ex_4D_float
 GeneratedTests.fill_ex_dynamic_nnfw
@@ -94,6 +98,7 @@ GeneratedTests.logical_not
 GeneratedTests.logical_not_1D_nnfw
 GeneratedTests.logical_not_4D_nnfw
 GeneratedTests.logical_not_dynamic_nnfw
+GeneratedTests.logical_or_dynamic_nnfw
 GeneratedTests.logistic_dynamic_nnfw
 GeneratedTests.lsh_projection
 GeneratedTests.lsh_projection_2
@@ -174,6 +179,78 @@ GeneratedTests.reduce_prod_dynamic_2_nnfw
 GeneratedTests.reduce_sum_dynamic_1_nnfw
 GeneratedTests.reduce_sum_dynamic_2_nnfw
 GeneratedTests.reshape_dynamic_nnfw
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.reverse_ex_1d
 GeneratedTests.reverse_ex_3d
 GeneratedTests.reverse_ex_dynamic_1D
@@ -222,6 +299,7 @@ GeneratedTests.space_to_batch_quant8_2
 GeneratedTests.space_to_batch_quant8_2_nnfw
 GeneratedTests.space_to_batch_quant8_3
 GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_float_5_axis_as_input_nnfw
 GeneratedTests.split_v_ex_1D_float_1_nnfw
 GeneratedTests.split_v_ex_1D_float_2_nnfw
 GeneratedTests.split_v_ex_1D_int32_nnfw
@@ -276,8 +354,7 @@ GeneratedTests.topk_v2_4
 GeneratedTests.topk_v2_5
 GeneratedTests.topk_v2_6
 GeneratedTests.transpose_dynamic_nnfw
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
+GeneratedTests.transpose_float_1_perms_as_input_nnfw
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
 GeneratedTests.unpack_ex_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
index e98007e08..c15aecdfa 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
@@ -110,6 +110,78 @@ GeneratedTests.relu6_quant8_1
 GeneratedTests.relu6_quant8_2
 GeneratedTests.relu_quant8_1
 GeneratedTests.relu_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.rnn
 GeneratedTests.rnn_state
 GeneratedTests.rsqrt
@@ -171,7 +243,5 @@ GeneratedTests.transpose_conv_ex_float_1
 GeneratedTests.transpose_conv_ex_float_2
 GeneratedTests.transpose_conv_ex_float_3
 GeneratedTests.transpose_conv_ex_float_4
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
index 984dbfa2a..a9f6bf5b1 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
@@ -1,6 +1,8 @@
 GeneratedTests.abs_
 GeneratedTests.abs_dynamic_nnfw
 GeneratedTests.add_dynamic_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw_quant8
 GeneratedTests.argmax_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_float_adj_x
@@ -58,6 +60,8 @@ GeneratedTests.einsum_ex_float_matmul_4x4_4_2
 GeneratedTests.equal_dynamic_float_nnfw
 GeneratedTests.exp_
 GeneratedTests.exp_dynamic_nnfw
+GeneratedTests.expand_dims_dynamic_nnfw_1
+GeneratedTests.expand_dims_dynamic_nnfw_2
 GeneratedTests.fill_ex_1D_float
 GeneratedTests.fill_ex_4D_float
 GeneratedTests.fill_ex_dynamic_nnfw
@@ -92,6 +96,7 @@ GeneratedTests.logical_not_1D_nnfw
 GeneratedTests.logical_not_4D_nnfw
 GeneratedTests.logical_not_dynamic_nnfw
 GeneratedTests.logical_or_broadcast
+GeneratedTests.logical_or_dynamic_nnfw
 GeneratedTests.logistic_dynamic_nnfw
 GeneratedTests.lsh_projection
 GeneratedTests.lsh_projection_2
@@ -110,6 +115,7 @@ GeneratedTests.matrix_band_part_ex_4D_float
 GeneratedTests.matrix_band_part_ex_dynamic_nnfw
 GeneratedTests.maximum_dynamic_nnfw
 GeneratedTests.minimum_dynamic_nnfw
+GeneratedTests.minimum_int32
 GeneratedTests.mul_dynamic_nnfw
 GeneratedTests.neg
 GeneratedTests.neg_dynamic_nnfw
@@ -169,6 +175,32 @@ GeneratedTests.reduce_prod_dynamic_2_nnfw
 GeneratedTests.reduce_sum_dynamic_1_nnfw
 GeneratedTests.reduce_sum_dynamic_2_nnfw
 GeneratedTests.reshape_dynamic_nnfw
+GeneratedTests.reshape_quant8_weights_as_inputs
+GeneratedTests.reshape_weights_as_inputs
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.reverse_ex_1d
 GeneratedTests.reverse_ex_3d
 GeneratedTests.reverse_ex_dynamic_1D
@@ -210,6 +242,7 @@ GeneratedTests.slice_zero_sized_quant8
 GeneratedTests.softmax_dynamic_nnfw
 GeneratedTests.space_to_batch_dynamic_float_nnfw
 GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_float_5_axis_as_input_nnfw
 GeneratedTests.split_v_ex_1D_float_1_nnfw
 GeneratedTests.split_v_ex_1D_float_2_nnfw
 GeneratedTests.split_v_ex_1D_int32_nnfw
@@ -252,8 +285,7 @@ GeneratedTests.tile_3_float16
 GeneratedTests.tile_3_int32
 GeneratedTests.tile_3_quant8
 GeneratedTests.transpose_dynamic_nnfw
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
+GeneratedTests.transpose_float_1_perms_as_input_nnfw
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
 GeneratedTests.unpack_ex_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index 036c869c6..2f313c759 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
@@ -1,6 +1,8 @@
 GeneratedTests.abs_
 GeneratedTests.abs_dynamic_nnfw
 GeneratedTests.add_dynamic_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw_quant8
 GeneratedTests.argmax_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_dynamic_nnfw
 GeneratedTests.batch_matmul_ex_float_adj_x
@@ -59,6 +61,8 @@ GeneratedTests.equal_dynamic_float_nnfw
 GeneratedTests.exp_
 GeneratedTests.exp_2D_float_nnfw
 GeneratedTests.exp_dynamic_nnfw
+GeneratedTests.expand_dims_dynamic_nnfw_1
+GeneratedTests.expand_dims_dynamic_nnfw_2
 GeneratedTests.fill_ex_1D_float
 GeneratedTests.fill_ex_4D_float
 GeneratedTests.fill_ex_dynamic_nnfw
@@ -93,6 +97,7 @@ GeneratedTests.logical_not
 GeneratedTests.logical_not_1D_nnfw
 GeneratedTests.logical_not_4D_nnfw
 GeneratedTests.logical_not_dynamic_nnfw
+GeneratedTests.logical_or_dynamic_nnfw
 GeneratedTests.logistic_dynamic_nnfw
 GeneratedTests.lsh_projection
 GeneratedTests.lsh_projection_2
@@ -173,6 +178,80 @@ GeneratedTests.reduce_prod_dynamic_2_nnfw
 GeneratedTests.reduce_sum_dynamic_1_nnfw
 GeneratedTests.reduce_sum_dynamic_2_nnfw
 GeneratedTests.reshape_dynamic_nnfw
+GeneratedTests.reshape_quant8_weights_as_inputs
+GeneratedTests.reshape_weights_as_inputs
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.reverse_ex_1d
 GeneratedTests.reverse_ex_3d
 GeneratedTests.reverse_ex_dynamic_1D
@@ -214,6 +293,7 @@ GeneratedTests.slice_zero_sized_quant8
 GeneratedTests.softmax_dynamic_nnfw
 GeneratedTests.space_to_batch_dynamic_float_nnfw
 GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_float_5_axis_as_input_nnfw
 GeneratedTests.split_v_ex_1D_float_1_nnfw
 GeneratedTests.split_v_ex_1D_float_2_nnfw
 GeneratedTests.split_v_ex_1D_int32_nnfw
@@ -268,8 +348,7 @@ GeneratedTests.topk_v2_4
 GeneratedTests.topk_v2_5
 GeneratedTests.topk_v2_6
 GeneratedTests.transpose_dynamic_nnfw
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
+GeneratedTests.transpose_float_1_perms_as_input_nnfw
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
 GeneratedTests.unpack_ex_dynamic_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
index e98007e08..c15aecdfa 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
@@ -110,6 +110,78 @@ GeneratedTests.relu6_quant8_1
 GeneratedTests.relu6_quant8_2
 GeneratedTests.relu_quant8_1
 GeneratedTests.relu_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.rnn
 GeneratedTests.rnn_state
 GeneratedTests.rsqrt
@@ -171,7 +243,5 @@ GeneratedTests.transpose_conv_ex_float_1
 GeneratedTests.transpose_conv_ex_float_2
 GeneratedTests.transpose_conv_ex_float_3
 GeneratedTests.transpose_conv_ex_float_4
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp
index 9e51e759e..027c878f1 100644
--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp
+++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp
@@ -12,6 +12,8 @@ GeneratedTests.argmax_1_quant8
 GeneratedTests.argmax_2
 GeneratedTests.argmax_2_quant8
 GeneratedTests.argmax_3
+GeneratedTests.argmax_3_axis_as_input_nnfw
+GeneratedTests.argmax_3_axis_as_input_nnfw_quant8
 GeneratedTests.argmax_3_quant8
 GeneratedTests.argmax_dynamic_nnfw
 GeneratedTests.argmax_float_1_nnfw
@@ -245,6 +247,7 @@ GeneratedTests.logical_or_4D_nnfw
 GeneratedTests.logical_or_broadcast
 GeneratedTests.logical_or_broadcast_4D_2D_nnfw
 GeneratedTests.logical_or_broadcast_nnfw
+GeneratedTests.logical_or_dynamic_nnfw
 GeneratedTests.logical_or_simple
 GeneratedTests.logistic_dynamic_nnfw
 GeneratedTests.logistic_quant8_1
@@ -291,6 +294,7 @@ GeneratedTests.minimum_overflow
 GeneratedTests.minimum_quant8_nnfw
 GeneratedTests.minimum_simple
 GeneratedTests.minimum_simple_quant8
+GeneratedTests.minimum_int32
 GeneratedTests.mul_broadcast_quant8
 GeneratedTests.mul_dynamic_nnfw
 GeneratedTests.mul_quant8
@@ -316,6 +320,9 @@ GeneratedTests.not_equal_quantized_overflow_first_input_if_requantized
 GeneratedTests.not_equal_quantized_overflow_second_input_if_requantized
 GeneratedTests.not_equal_simple
 GeneratedTests.one_hot_ex_dynamic_nnfw
+GeneratedTests.one_hot_ex_float_1_nnfw
+GeneratedTests.one_hot_ex_float_2_nnfw
+GeneratedTests.one_hot_ex_float_off_value_constant_zero_nnfw
 GeneratedTests.pack_ex_2D_float_1
 GeneratedTests.pack_ex_2D_float_2
 GeneratedTests.pack_ex_2D_int_1
@@ -434,6 +441,78 @@ GeneratedTests.reshape_dynamic_nnfw
 GeneratedTests.resize_bilinear
 GeneratedTests.resize_bilinear_2
 GeneratedTests.resize_bilinear_quant8_nnfw
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.reverse_ex_1d
 GeneratedTests.reverse_ex_3d
 GeneratedTests.reverse_ex_dynamic_1D
@@ -512,6 +591,7 @@ GeneratedTests.split_float_2
 GeneratedTests.split_float_3
 GeneratedTests.split_float_4
 GeneratedTests.split_float_5
+GeneratedTests.split_float_5_axis_as_input_nnfw
 GeneratedTests.split_int32_1
 GeneratedTests.split_int32_1_relaxed
 GeneratedTests.split_int32_2
@@ -626,6 +706,7 @@ GeneratedTests.transpose_2D_nnfw
 GeneratedTests.transpose_3D_nnfw
 GeneratedTests.transpose_dynamic_nnfw
 GeneratedTests.transpose_float_1
+GeneratedTests.transpose_float_1_perms_as_input_nnfw
 GeneratedTests.transpose_quant8_1
 GeneratedTests.transpose_v1_2
 GeneratedTests.transpose_v1_2_quant8
diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
index e98007e08..05cb06e7e 100644
--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
@@ -67,6 +67,7 @@ GeneratedTests.lstm_state2
 GeneratedTests.maximum_broadcast_quant8
 GeneratedTests.maximum_overflow
 GeneratedTests.maximum_simple_quant8
+GeneratedTests.minimum_int32
 GeneratedTests.minimum_broadcast_quant8
 GeneratedTests.minimum_overflow
 GeneratedTests.minimum_simple_quant8
@@ -110,6 +111,78 @@ GeneratedTests.relu6_quant8_1
 GeneratedTests.relu6_quant8_2
 GeneratedTests.relu_quant8_1
 GeneratedTests.relu_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nchw
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_scale_nchw
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_2
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_2
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_2
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_2
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_3
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_3
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_3
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_3
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_4
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_4
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_4
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_4
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_5
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_5
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_5
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_5
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_6
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_6
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_6
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_6
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_7
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_7
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_7
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_7
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_8
+GeneratedTests.resize_nearest_neighbor_shape_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_8
+GeneratedTests.resize_nearest_neighbor_shape_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_8
+GeneratedTests.resize_nearest_neighbor_scale_nhwc_quant8_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_8
+GeneratedTests.resize_nearest_neighbor_scale_nchw_quant8_8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nhwc_quant8_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_2
+GeneratedTests.resize_nearest_neighbor_zero_sized_nchw_quant8_2
 GeneratedTests.rnn
 GeneratedTests.rnn_state
 GeneratedTests.rsqrt
@@ -171,7 +244,5 @@ GeneratedTests.transpose_conv_ex_float_1
 GeneratedTests.transpose_conv_ex_float_2
 GeneratedTests.transpose_conv_ex_float_3
 GeneratedTests.transpose_conv_ex_float_4
-GeneratedTests.transpose_v1_2
-GeneratedTests.transpose_v1_2_quant8
 GeneratedTests.transpose_v1_2_zero_sized
 GeneratedTests.transpose_v1_2_zero_sized_quant8
diff --git a/tests/nnapi/specs/Ex/one_hot_ex_float_1_nnfw.mod.py b/tests/nnapi/specs/Ex/one_hot_ex_float_1_nnfw.mod.py
new file mode 100644
index 000000000..1bdf22b92
--- /dev/null
+++ b/tests/nnapi/specs/Ex/one_hot_ex_float_1_nnfw.mod.py
@@ -0,0 +1,48 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model = Model()
+
+
+indices = Input("indices", "TENSOR_INT32", "{2, 2}")
+depth = Parameter("depth", "TENSOR_INT32", "{1}", [3])
+onvalue = Input("onvalue", "TENSOR_FLOAT32", "{1}")
+offvalue = Input("offvalue", "TENSOR_FLOAT32", "{1}")
+
+axis0 = Int32Scalar("axis", -1) # default value is -1.
+model_output0 = Output("output", "TENSOR_FLOAT32", "{2, 2, 3}")
+
+model0 = model.Operation("ONE_HOT_EX", indices, depth, onvalue, offvalue, axis0).To(model_output0)
+
+model_output_data = ([0., 1., 0.,
+                      0., 0., 1.,
+                      1., 0., 0.,
+                      0., 0., 1.,])
+
+indices_data = [1, 2, 0, 2]
+onvalue_data = [1.]
+offvalue_data = [0.]
+
+Example(
+  {
+    indices : indices_data,
+    onvalue : onvalue_data,
+    offvalue : offvalue_data,
+
+    model_output0 : model_output_data,
+  })
+
diff --git a/tests/nnapi/specs/Ex/one_hot_ex_float_2_nnfw.mod.py b/tests/nnapi/specs/Ex/one_hot_ex_float_2_nnfw.mod.py
new file mode 100644
index 000000000..6a41488df
--- /dev/null
+++ b/tests/nnapi/specs/Ex/one_hot_ex_float_2_nnfw.mod.py
@@ -0,0 +1,47 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model = Model()
+
+
+indices = Input("indices", "TENSOR_INT32", "{1, 2, 2}")
+depth = Parameter("depth", "TENSOR_INT32", "{1}", [3])
+onvalue = Input("onvalue", "TENSOR_FLOAT32", "{1}")
+offvalue = Input("offvalue", "TENSOR_FLOAT32", "{1}")
+
+axis0 = Int32Scalar("axis", 1) # default value is -1.
+model_output0 = Output("output", "TENSOR_FLOAT32", "{1, 3, 2, 2}")
+
+model0 = model.Operation("ONE_HOT_EX", indices, depth, onvalue, offvalue, axis0).To(model_output0)
+
+model_output_data = ([1., 1., 2., 1.,
+		              2., 1., 1., 1.,
+					  1., 2., 1., 2.,])
+
+indices_data = [1, 2, 0, 2]
+onvalue_data = [2.]
+offvalue_data = [1.]
+
+Example(
+  {
+    indices : indices_data,
+    onvalue : onvalue_data,
+    offvalue : offvalue_data,
+
+    model_output0 : model_output_data,
+  })
+
diff --git a/tests/nnapi/specs/Ex/one_hot_ex_float_off_value_constant_zero_nnfw.mod.py b/tests/nnapi/specs/Ex/one_hot_ex_float_off_value_constant_zero_nnfw.mod.py
new file mode 100644
index 000000000..fc8859baa
--- /dev/null
+++ b/tests/nnapi/specs/Ex/one_hot_ex_float_off_value_constant_zero_nnfw.mod.py
@@ -0,0 +1,45 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model = Model()
+
+
+indices = Input("indices", "TENSOR_INT32", "{1, 2, 2}")
+depth = Parameter("depth", "TENSOR_INT32", "{1}", [3])
+onvalue = Input("onvalue", "TENSOR_FLOAT32", "{1}")
+offvalue = Parameter("offvalue", "TENSOR_FLOAT32", "{1}", [0.])
+
+axis0 = Int32Scalar("axis", 2) # default value is -1.
+model_output0 = Output("output", "TENSOR_FLOAT32", "{1, 2, 3, 2}")
+
+model0 = model.Operation("ONE_HOT_EX", indices, depth, onvalue, offvalue, axis0).To(model_output0)
+
+model_output_data = ([0., 0., 1., 0., 0., 1.,
+                      1., 0., 0., 0., 0., 1.,])
+
+
+indices_data = [1, 2, 0, 2]
+onvalue_data = [1.]
+
+Example(
+  {
+    indices : indices_data,
+    onvalue : onvalue_data,
+
+    model_output0 : model_output_data,
+  })
+
diff --git a/tests/nnapi/specs/V1_1/transpose_float_1_perms_as_input_nnfw.mod.py b/tests/nnapi/specs/V1_1/transpose_float_1_perms_as_input_nnfw.mod.py
new file mode 100644
index 000000000..c4040f4bb
--- /dev/null
+++ b/tests/nnapi/specs/V1_1/transpose_float_1_perms_as_input_nnfw.mod.py
@@ -0,0 +1,34 @@
+model = Model()
+i1 = Input("input", "TENSOR_FLOAT32", "{2, 3, 4, 5}")
+perms = Input("perms", "TENSOR_INT32", "{4}")
+output = Output("output", "TENSOR_FLOAT32", "{4, 2, 3, 5}")
+
+model = model.Operation("TRANSPOSE", i1, perms).To(output)
+
+# Example 1. Inputs in operand 0,
+input0 = {i1: # input 0
+          [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
+           12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+           24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+           36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+           48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+           60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+           72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
+           84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+           96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+           108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
+          perms: # permutations
+          [2, 0, 1, 3]}
+
+output0 = {output: # output 0
+          [0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+           60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+           5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+           65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+           10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+           70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+           15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+           75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/V1_2/argmax_3_axis_as_input_nnfw.mod.py b/tests/nnapi/specs/V1_2/argmax_3_axis_as_input_nnfw.mod.py
new file mode 100644
index 000000000..a3550367f
--- /dev/null
+++ b/tests/nnapi/specs/V1_2/argmax_3_axis_as_input_nnfw.mod.py
@@ -0,0 +1,35 @@
+#
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Negative axis support test.
+
+input0 = Input("input0", "TENSOR_FLOAT32", "{2, 2}")
+axis = Input("axis", "TENSOR_INT32", "{}")
+output0 = Output("output", "TENSOR_INT32", "{2}")
+
+model = Model().Operation("ARGMAX", input0, axis).To(output0)
+
+quant8 = DataTypeConverter().Identify({
+    input0: ["TENSOR_QUANT8_ASYMM", 1.0, 0],
+})
+
+Example({
+    input0: [1.0, 2.0,
+             4.0, 3.0],
+    axis: [-1],
+    output0: [1, 0],
+}).AddVariations("relaxed", "float16", "int32", quant8)
diff --git a/tests/nnapi/specs/V1_2/minimum_int32.mod.py b/tests/nnapi/specs/V1_2/minimum_int32.mod.py
new file mode 100644
index 000000000..7d65f1039
--- /dev/null
+++ b/tests/nnapi/specs/V1_2/minimum_int32.mod.py
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (C) 2020 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+i1 = Input("input0", "TENSOR_INT32", "{3, 1, 2}")
+i2 = Input("input1", "TENSOR_INT32", "{3, 1, 2}")
+i3 = Output("output0", "TENSOR_INT32", "{3, 1, 2}")
+
+model = Model().Operation("MINIMUM", i1, i2).To(i3)
+
+input0 = {i1:
+          [129, 12, 15, 130, -77, 33],
+          i2:
+          [44, 127, -25, 5, 39, 27]}
+
+output0 = {i3:
+           [44, 12, -25, 5, -77, 27]}
+
+Example((input0, output0))
diff --git a/tests/nnapi/specs/skip/V1_2/resize_nearest_neighbor.mod.py b/tests/nnapi/specs/V1_2/resize_nearest_neighbor.mod.py
index 04102c5ed..04102c5ed 100644
--- a/tests/nnapi/specs/skip/V1_2/resize_nearest_neighbor.mod.py
+++ b/tests/nnapi/specs/V1_2/resize_nearest_neighbor.mod.py
diff --git a/tests/nnapi/specs/V1_2/split_float_5_axis_as_input_nnfw.mod.py b/tests/nnapi/specs/V1_2/split_float_5_axis_as_input_nnfw.mod.py
new file mode 100644
index 000000000..9676e1638
--- /dev/null
+++ b/tests/nnapi/specs/V1_2/split_float_5_axis_as_input_nnfw.mod.py
@@ -0,0 +1,38 @@
+#
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# model
+input0 = Input("input0", "TENSOR_FLOAT32", "{2, 2, 2}")
+axis = Input("axis", "TENSOR_INT32", "{}")
+num_splits = Int32Scalar("num_splits", 2)
+output0 = Output("output0", "TENSOR_FLOAT32", "{2, 1, 2}")
+output1 = Output("output1", "TENSOR_FLOAT32", "{2, 1, 2}")
+
+model = Model().Operation("SPLIT", input0, axis, num_splits).To((output0, output1))
+
+# Example 1.
+input_dict = {
+    input0: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+    axis: [-2]
+}
+output_dict = {
+    output0: [1.0, 2.0, 5.0, 6.0],
+    output1: [3.0, 4.0, 7.0, 8.0],
+}
+
+# Instantiate an example
+Example((input_dict, output_dict)).AddVariations("relaxed", "float16")
diff --git a/tests/nnfw_api/README.md b/tests/nnfw_api/README.md
index 7e14fc445..58ba12992 100644
--- a/tests/nnfw_api/README.md
+++ b/tests/nnfw_api/README.md
@@ -16,6 +16,8 @@ This test framework consists of 3 kinds of tests:
 
 ## nnpackages for testing
 
+> NOTE It is not recommended adding a test this way, since you can make a Circle model with some code using `CircleGen` class. See also `GenModelTest`.
+
 To test *nnfw_api*, we almost always need some nnpackages. Those are stored in a web server so there is no nnpackage files in the repo.
 
 ### How to add nnpackages for test
@@ -27,4 +29,4 @@ Once you have done the above steps, please register it in the test source code t
 
 ### Installation
 
-You must install the test nnpackages before running the tests. They must be in the same directory with the test executable, under `nnfw_api_gtest_models/`. There is an installation script `tests/scripts/nnfw_api_gtest/install_nnfw_api_gtest_nnpackages.sh`, however the nnpackage file server is not public so it will fail.
+You must install the test nnpackages before running the tests. They must be in the same directory with the test executable, under `nnfw_api_gtest_models/`. Installation is done by command `onert-test prepare-model`. It only runs correctly on CI, since the nnpackage file server is not public.
diff --git a/tests/nnfw_api/src/CircleGen.cc b/tests/nnfw_api/src/CircleGen.cc
index 19cb95f37..8040f7dfb 100644
--- a/tests/nnfw_api/src/CircleGen.cc
+++ b/tests/nnfw_api/src/CircleGen.cc
@@ -14,22 +14,6 @@
  * limitations under the License.
  */
 
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #include "CircleGen.h"
 
 CircleGen::CircleGen() : _subgraph_contexts(1) // Create primary subgraph
@@ -54,11 +38,18 @@ uint32_t CircleGen::addBuffer(const uint8_t *buf, size_t size)
 
 uint32_t CircleGen::addTensor(const TensorParams &params)
 {
-  int ind = curSubgCtx().tensors.size();
+  uint32_t ind = curSubgCtx().tensors.size();
   curSubgCtx().tensors.emplace_back(buildTensor(params));
   return ind;
 }
 
+uint32_t CircleGen::addTensor(const TensorParams &params, const SparsityParams &sp)
+{
+  uint32_t ind = curSubgCtx().tensors.size();
+  curSubgCtx().tensors.emplace_back(buildTensor(params, sp));
+  return ind;
+}
+
 void CircleGen::setInputsAndOutputs(const std::vector<int> &inputs, const std::vector<int> &outputs)
 {
   curSubgCtx().inputs = inputs;
@@ -93,6 +84,13 @@ uint32_t CircleGen::addOperatorAdd(const OperatorParams &params,
                                 circle::BuiltinOptions_AddOptions, options);
 }
 
+uint32_t CircleGen::addOperatorArgMax(const OperatorParams &params, circle::TensorType output_type)
+{
+  auto options = circle::CreateArgMaxOptions(_fbb, output_type).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_ARG_MAX,
+                                circle::BuiltinOptions_ArgMaxOptions, options);
+}
+
 uint32_t CircleGen::addOperatorAveragePool2D(const OperatorParams &params, circle::Padding padding,
                                              int stride_w, int stride_h, int filter_w, int filter_h,
                                              circle::ActivationFunctionType actfn)
@@ -104,6 +102,14 @@ uint32_t CircleGen::addOperatorAveragePool2D(const OperatorParams &params, circl
                                 circle::BuiltinOptions_Pool2DOptions, options);
 }
 
+uint32_t CircleGen::addOperatorCast(const OperatorParams &params, circle::TensorType input_type,
+                                    circle::TensorType output_type)
+{
+  auto options = circle::CreateCastOptions(_fbb, input_type, output_type).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_CAST,
+                                circle::BuiltinOptions_AddOptions, options);
+}
+
 uint32_t CircleGen::addOperatorConcatenation(const OperatorParams &params, int axis,
                                              circle::ActivationFunctionType actfn)
 {
@@ -119,6 +125,20 @@ uint32_t CircleGen::addOperatorCos(const OperatorParams &params)
                                 circle::BuiltinOptions_CosOptions, options);
 }
 
+uint32_t CircleGen::addOperatorEqual(const OperatorParams &params)
+{
+  auto options = circle::CreateEqualOptions(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_EQUAL,
+                                circle::BuiltinOptions_EqualOptions, options);
+}
+
+uint32_t CircleGen::addOperatorFullyConnected(const OperatorParams &params)
+{
+  auto options = circle::CreateFullyConnectedOptions(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_FULLY_CONNECTED,
+                                circle::BuiltinOptions_FullyConnectedOptions, options);
+}
+
 uint32_t CircleGen::addOperatorL2Normalization(const OperatorParams &params)
 {
   auto options = circle::CreateL2NormOptions(_fbb).Union();
@@ -140,6 +160,13 @@ uint32_t CircleGen::addOperatorLeakyRelu(const OperatorParams &params, float alp
                                 circle::BuiltinOptions_LeakyReluOptions, options);
 }
 
+uint32_t CircleGen::addOperatorLogSoftmax(const OperatorParams &params)
+{
+  auto options = circle::CreateLogSoftmaxOptions(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_LOG_SOFTMAX,
+                                circle::BuiltinOptions_LogSoftmaxOptions, options);
+}
+
 uint32_t CircleGen::addOperatorNeg(const OperatorParams &params)
 {
   auto options = circle::CreatePadOptions(_fbb).Union();
@@ -147,6 +174,13 @@ uint32_t CircleGen::addOperatorNeg(const OperatorParams &params)
                                 circle::BuiltinOptions_NegOptions, options);
 }
 
+uint32_t CircleGen::addOperatorOneHot(const OperatorParams &params, int32_t axis)
+{
+  auto options = circle::CreateOneHotOptions(_fbb, axis).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_ONE_HOT,
+                                circle::BuiltinOptions_OneHotOptions, options);
+}
+
 uint32_t CircleGen::addOperatorPad(const OperatorParams &params)
 {
   auto options = circle::CreatePadOptions(_fbb).Union();
@@ -168,6 +202,22 @@ uint32_t CircleGen::addOperatorRank(const OperatorParams &params)
                                 circle::BuiltinOptions_RankOptions, options);
 }
 
+uint32_t CircleGen::addOperatorReshape(const OperatorParams &params, const Shape &new_shape)
+{
+  auto options = circle::CreateReshapeOptionsDirect(_fbb, &new_shape).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_RESHAPE,
+                                circle::BuiltinOptions_ReshapeOptions, options);
+}
+
+uint32_t CircleGen::addOperatorResizeBilinear(const OperatorParams &params, bool align_corners,
+                                              bool half_pixel_centers)
+{
+  auto options =
+      circle::CreateResizeBilinearOptions(_fbb, align_corners, half_pixel_centers).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_RESIZE_BILINEAR,
+                                circle::BuiltinOptions_ResizeBilinearOptions, options);
+}
+
 uint32_t CircleGen::addOperatorResizeNearestNeighbor(const OperatorParams &params)
 {
   auto options = circle::CreateResizeNearestNeighborOptions(_fbb).Union();
@@ -175,6 +225,36 @@ uint32_t CircleGen::addOperatorResizeNearestNeighbor(const OperatorParams &param
                                 circle::BuiltinOptions_ResizeNearestNeighborOptions, options);
 }
 
+uint32_t CircleGen::addOperatorReverseV2(const OperatorParams &params)
+{
+  auto options = circle::CreateReverseV2Options(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_REVERSE_V2,
+                                circle::BuiltinOptions_ReverseV2Options, options);
+}
+
+uint32_t CircleGen::addOperatorSplit(const OperatorParams &params, int32_t num_split)
+{
+  auto options = circle::CreateSplitOptions(_fbb, num_split).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_SPLIT,
+                                circle::BuiltinOptions_SplitOptions, options);
+}
+uint32_t CircleGen::addOperatorStridedSlice(const OperatorParams &params, int32_t begin_mask,
+                                            int32_t end_mask, int32_t ellipsis_mask,
+                                            int32_t new_axis_mask, int32_t shrink_axis_mask)
+{
+  auto options = circle::CreateStridedSliceOptions(_fbb, begin_mask, end_mask, ellipsis_mask,
+                                                   new_axis_mask, shrink_axis_mask)
+                     .Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_STRIDED_SLICE,
+                                circle::BuiltinOptions_StridedSliceOptions, options);
+}
+uint32_t CircleGen::addOperatorTile(const OperatorParams &params)
+{
+  auto options = circle::CreateTileOptions(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_TILE,
+                                circle::BuiltinOptions_TileOptions, options);
+}
+
 uint32_t CircleGen::addOperatorWhile(const OperatorParams &params, uint32_t cond_subg,
                                      uint32_t body_subg)
 {
@@ -183,6 +263,29 @@ uint32_t CircleGen::addOperatorWhile(const OperatorParams &params, uint32_t cond
                                 circle::BuiltinOptions_WhileOptions, options);
 }
 
+uint32_t CircleGen::addOperatorIf(const OperatorParams &params, uint32_t then_subg,
+                                  uint32_t else_subg)
+{
+  auto options = circle::CreateIfOptions(_fbb, then_subg, else_subg).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_IF,
+                                circle::BuiltinOptions_IfOptions, options);
+}
+
+uint32_t CircleGen::addOperatorInstanceNorm(const OperatorParams &params, float epsilon,
+                                            circle::ActivationFunctionType actfn)
+{
+  auto options = circle::CreateInstanceNormOptions(_fbb, epsilon, actfn).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_INSTANCE_NORM,
+                                circle::BuiltinOptions_InstanceNormOptions, options);
+}
+
+uint32_t CircleGen::addOperatorTranspose(const OperatorParams &params)
+{
+  auto options = circle::CreateTransposeOptions(_fbb).Union();
+  return addOperatorWithOptions(params, circle::BuiltinOperator_TRANSPOSE,
+                                circle::BuiltinOptions_TransposeOptions, options);
+}
+
 // NOTE Please add addOperator functions ABOVE this lie
 //
 // %  How to add a new addOperatorXXX fuction
@@ -233,6 +336,43 @@ flatbuffers::Offset<circle::Tensor> CircleGen::buildTensor(const TensorParams &p
                               0 /* shape_signature */);
 }
 
+flatbuffers::Offset<circle::SparsityParameters>
+CircleGen::buildSparsityParameters(const SparsityParams &sp)
+{
+  flatbuffers::Offset<flatbuffers::Vector<int32_t>> traversal_order;
+  flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_map;
+  flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<circle::DimensionMetadata>>>
+      dim_metadata;
+
+  traversal_order = _fbb.CreateVector(sp.traversal_order);
+  block_map = _fbb.CreateVector(sp.block_map);
+
+  std::vector<flatbuffers::Offset<circle::DimensionMetadata>> dim_metadata_vec;
+  for (auto &it : sp.dim_metadata)
+  {
+    auto fb_array_segments = circle::CreateUint16VectorDirect(_fbb, &it._array_segments.u16);
+    auto fb_array_indices = circle::CreateUint16VectorDirect(_fbb, &it._array_indices.u16);
+    auto dim_metadata = circle::CreateDimensionMetadata(
+        _fbb, it._format, it._dense_size, it._array_segments_type, fb_array_segments.Union(),
+        it._array_indices_type, fb_array_indices.Union());
+    dim_metadata_vec.emplace_back(dim_metadata);
+  }
+  dim_metadata = _fbb.CreateVector(dim_metadata_vec);
+
+  return circle::CreateSparsityParameters(_fbb, traversal_order, block_map, dim_metadata);
+}
+
+flatbuffers::Offset<circle::Tensor> CircleGen::buildTensor(const TensorParams &params,
+                                                           const SparsityParams &sp)
+{
+  auto shape = _fbb.CreateVector(params.shape);
+  auto name = _fbb.CreateString(params.name);
+  auto sparsity = buildSparsityParameters(sp);
+  return circle::CreateTensor(_fbb, shape, params.tensor_type, params.buffer, name,
+                              0 /* QuantParam */, false /* is_variable */, sparsity,
+                              0 /* shape_signature */);
+}
+
 flatbuffers::Offset<circle::SubGraph> CircleGen::buildSubGraph(const SubgraphContext &ctx)
 {
   return circle::CreateSubGraphDirect(_fbb, &ctx.tensors, &ctx.inputs, &ctx.outputs, &ctx.operators,
diff --git a/tests/nnfw_api/src/CircleGen.h b/tests/nnfw_api/src/CircleGen.h
index 09ca5a5db..d72fb95ab 100644
--- a/tests/nnfw_api/src/CircleGen.h
+++ b/tests/nnfw_api/src/CircleGen.h
@@ -52,6 +52,47 @@ private:
 class CircleGen
 {
 public:
+  using Shape = std::vector<int32_t>;
+
+  using SparseIndexVectorType = circle::SparseIndexVector;
+  using SparseDimensionType = circle::DimensionType;
+
+  struct SparseIndexVector
+  {
+    std::vector<uint16_t> u16;
+  };
+
+  struct DimMetaData
+  {
+    DimMetaData() = delete;
+    DimMetaData(SparseDimensionType format, std::vector<uint16_t> array_segments,
+                std::vector<uint16_t> array_indices)
+        : _format{format},
+          _array_segments_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector),
+          _array_indices_type(SparseIndexVectorType::SparseIndexVector_Uint16Vector)
+    {
+      _array_segments.u16 = array_segments;
+      _array_indices.u16 = array_indices;
+    }
+    DimMetaData(SparseDimensionType format, int32_t dense_size)
+        : _format{format}, _dense_size{dense_size}
+    {
+    }
+    SparseDimensionType _format{circle::DimensionType_DENSE};
+    int32_t _dense_size{0};
+    SparseIndexVectorType _array_segments_type{circle::SparseIndexVector_NONE};
+    SparseIndexVector _array_segments;
+    SparseIndexVectorType _array_indices_type{circle::SparseIndexVector_NONE};
+    SparseIndexVector _array_indices;
+  };
+
+  struct SparsityParams
+  {
+    std::vector<int32_t> traversal_order;
+    std::vector<int32_t> block_map;
+    std::vector<DimMetaData> dim_metadata;
+  };
+
   struct TensorParams
   {
     std::vector<int32_t> shape;
@@ -86,30 +127,52 @@ public:
   }
   uint32_t addBuffer(const uint8_t *buf, size_t size);
   uint32_t addTensor(const TensorParams &params);
+  uint32_t addTensor(const TensorParams &params, const SparsityParams &sp);
   void setInputsAndOutputs(const std::vector<int> &inputs, const std::vector<int> &outputs);
   uint32_t nextSubgraph();
   CircleBuffer finish();
 
-  // ===== Add Operator methods begin =====
+  // ===== Add Operator methods begin (SORTED IN ALPHABETICAL ORDER) =====
 
   uint32_t addOperatorAdd(const OperatorParams &params, circle::ActivationFunctionType actfn);
+  uint32_t addOperatorArgMax(const OperatorParams &params,
+                             circle::TensorType output_type = circle::TensorType::TensorType_INT32);
   uint32_t addOperatorAveragePool2D(const OperatorParams &params, circle::Padding padding,
                                     int stride_w, int stride_h, int filter_w, int filter_h,
                                     circle::ActivationFunctionType actfn);
+  uint32_t addOperatorCast(const OperatorParams &params, circle::TensorType input_type,
+                           circle::TensorType output_type);
   uint32_t addOperatorConcatenation(const OperatorParams &params, int axis,
                                     circle::ActivationFunctionType actfn);
   uint32_t addOperatorCos(const OperatorParams &params);
+  uint32_t addOperatorEqual(const OperatorParams &params);
+  uint32_t addOperatorFullyConnected(const OperatorParams &params);
+  uint32_t addOperatorIf(const OperatorParams &params, uint32_t then_subg, uint32_t else_subg);
+  uint32_t addOperatorInstanceNorm(const OperatorParams &params, float epsilon,
+                                   circle::ActivationFunctionType actfn);
   uint32_t addOperatorL2Normalization(const OperatorParams &params);
   uint32_t addOperatorLeakyRelu(const OperatorParams &params, float alpha);
   uint32_t addOperatorLess(const OperatorParams &params);
+  uint32_t addOperatorLogSoftmax(const OperatorParams &params);
   uint32_t addOperatorNeg(const OperatorParams &params);
+  uint32_t addOperatorOneHot(const OperatorParams &params, int32_t axis);
   uint32_t addOperatorPad(const OperatorParams &params);
   uint32_t addOperatorPadV2(const OperatorParams &params);
   uint32_t addOperatorRank(const OperatorParams &params);
+  uint32_t addOperatorReshape(const OperatorParams &params, const Shape &new_shape);
+  uint32_t addOperatorResizeBilinear(const OperatorParams &params, bool align_corners = false,
+                                     bool half_pixel_centers = false);
   uint32_t addOperatorResizeNearestNeighbor(const OperatorParams &params);
+  uint32_t addOperatorReverseV2(const OperatorParams &params);
+  uint32_t addOperatorSplit(const OperatorParams &params, int32_t num_split);
+  uint32_t addOperatorStridedSlice(const OperatorParams &params, int32_t begin_mask = 0,
+                                   int32_t end_mask = 0, int32_t ellipsis_mask = 0,
+                                   int32_t new_axis_mask = 0, int32_t shrink_axis_mask = 0);
+  uint32_t addOperatorTile(const OperatorParams &params);
+  uint32_t addOperatorTranspose(const OperatorParams &params);
   uint32_t addOperatorWhile(const OperatorParams &params, uint32_t cond_subg, uint32_t body_subg);
 
-  // NOTE Please add addOperator functions ABOVE this lie
+  // NOTE Please add addOperator functions ABOVE this line in ALPHABETICAL ORDER
   // ===== Add Operator methods end =====
 
 private:
@@ -119,6 +182,9 @@ private:
   uint32_t addOperatorCode(circle::BuiltinOperator opcode);
   flatbuffers::Offset<circle::Buffer> buildBuffer(const uint8_t *buf, size_t size);
   flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params);
+  flatbuffers::Offset<circle::SparsityParameters> buildSparsityParameters(const SparsityParams &sp);
+  flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params,
+                                                  const SparsityParams &sp);
   flatbuffers::Offset<circle::SubGraph> buildSubGraph(const SubgraphContext &ctx);
 
   SubgraphContext &curSubgCtx() { return _subgraph_contexts.back(); }
diff --git a/tests/nnfw_api/src/GenModelTest.h b/tests/nnfw_api/src/GenModelTest.h
index 530ccdd8c..a4c67a863 100644
--- a/tests/nnfw_api/src/GenModelTest.h
+++ b/tests/nnfw_api/src/GenModelTest.h
@@ -19,26 +19,121 @@
 
 #include <fstream>
 #include <string>
+#include <unordered_map>
 
 #include "CircleGen.h"
 #include "fixtures.h"
 
+inline size_t sizeOfNnfwType(NNFW_TYPE type)
+{
+  switch (type)
+  {
+    case NNFW_TYPE_TENSOR_BOOL:
+    case NNFW_TYPE_TENSOR_UINT8:
+    case NNFW_TYPE_TENSOR_QUANT8_ASYMM:
+      return 1;
+    case NNFW_TYPE_TENSOR_FLOAT32:
+    case NNFW_TYPE_TENSOR_INT32:
+      return 4;
+    case NNFW_TYPE_TENSOR_INT64:
+      return 8;
+    default:
+      throw std::runtime_error{"Invalid tensor type"};
+  }
+}
+
+// TODO Unify this with `SessionObject` in `fixtures.h`
+struct SessionObjectGeneric
+{
+  nnfw_session *session = nullptr;
+  std::vector<std::vector<uint8_t>> inputs;
+  std::vector<std::vector<uint8_t>> outputs;
+};
+
 struct TestCaseData
 {
   /**
    * @brief A vector of input buffers
-   *
-   * @todo support other types as well as float
    */
-  std::vector<std::vector<float>> inputs;
+  std::vector<std::vector<uint8_t>> inputs;
+
   /**
    * @brief A vector of output buffers
+   */
+  std::vector<std::vector<uint8_t>> outputs;
+
+  /**
+   * @brief Append vector data to inputs
    *
-   * @todo support other types as well as float
+   * @tparam T Data type
+   * @param data vector data array
+   */
+  template <typename T> void addInput(const std::vector<T> &data) { addData(inputs, data); }
+
+  /**
+   * @brief Append vector data to inputs
+   *
+   * @tparam T Data type
+   * @param data vector data array
+   */
+  template <typename T> void addOutput(const std::vector<T> &data) { addData(outputs, data); }
+
+  /**
+   * @brief Set @c True if @c NNFW_STATUS_ERROR is expected after calling @c nnfw_run() with
+   *        this test case; set @c False otherwise.
    */
-  std::vector<std::vector<float>> outputs;
+  void expect_error_on_run(bool expect_error_on_run) { _expect_error_on_run = expect_error_on_run; }
+  bool expect_error_on_run() const { return _expect_error_on_run; }
+
+private:
+  template <typename T>
+  static void addData(std::vector<std::vector<uint8_t>> &dest, const std::vector<T> &data)
+  {
+    size_t size = data.size() * sizeof(T);
+    dest.emplace_back();
+    dest.back().resize(size);
+    std::memcpy(dest.back().data(), data.data(), size);
+  }
+
+  bool _expect_error_on_run = false;
 };
 
+template <>
+inline void TestCaseData::addData<bool>(std::vector<std::vector<uint8_t>> &dest,
+                                        const std::vector<bool> &data)
+{
+  size_t size = data.size() * sizeof(uint8_t);
+  dest.emplace_back();
+  dest.back().resize(size);
+  std::transform(data.cbegin(), data.cend(), dest.back().data(),
+                 [](bool b) { return static_cast<uint8_t>(b); });
+}
+
+/**
+ * @brief Create a TestCaseData with a uniform type
+ *
+ * A helper function for generating test cases that has the same data type for model inputs/outputs.
+ *
+ * @tparam T Uniform tensor type
+ * @param inputs Inputs tensor buffers
+ * @param outputs Output tensor buffers
+ * @return TestCaseData Generated test case data
+ */
+template <typename T>
+static TestCaseData uniformTCD(const std::vector<std::vector<T>> &inputs,
+                               const std::vector<std::vector<T>> &outputs)
+{
+  TestCaseData ret;
+  for (const auto &data : inputs)
+    ret.addInput(data);
+  for (const auto &data : outputs)
+    ret.addOutput(data);
+  return ret;
+}
+
+/**
+ * @brief A test configuration class
+ */
 class GenModelTestContext
 {
 public:
@@ -66,11 +161,32 @@ public:
   const std::vector<std::string> &backends() const { return _backends; }
 
   /**
+   * @brief Return test is defined to fail on model load
+   *
+   * @return bool test is defined to fail on model load
+   */
+  bool expected_fail_model_load() const { return _expected_fail_model_load; }
+
+  /**
    * @brief Return test is defined to fail on compile
    *
    * @return bool test is defined to fail on compile
    */
-  const bool fail_compile() const { return _fail_compile; }
+  bool expected_fail_compile() const { return _expected_fail_compile; }
+
+  /**
+   * @brief Set the output buffer size of specified output tensor
+   *        Note that output tensor size of a model with dynamic tensor is calculated while
+   *        running the model.
+   *        Therefore, before runniing the model, the sufficient size of buffer should
+   *        be prepared by calling this method.
+   *        The size does not need to be the exact size.
+   */
+  void output_sizes(uint32_t ind, size_t size) { _output_sizes[ind] = size; }
+
+  size_t output_sizes(uint32_t ind) const { return _output_sizes.at(ind); }
+
+  bool hasOutputSizes(uint32_t ind) const { return _output_sizes.find(ind) != _output_sizes.end(); }
 
   /**
    * @brief Add a test case
@@ -104,15 +220,22 @@ public:
   }
 
   /**
-   * @brief Set the Test Fail
+   * @brief Expect failure while model load
    */
-  void setCompileFail() { _fail_compile = true; }
+  void expectFailModelLoad() { _expected_fail_model_load = true; }
+
+  /**
+   * @brief Expect failure while compiling
+   */
+  void expectFailCompile() { _expected_fail_compile = true; }
 
 private:
   CircleBuffer _cbuf;
   std::vector<TestCaseData> _test_cases;
   std::vector<std::string> _backends;
-  bool _fail_compile{false};
+  std::unordered_map<uint32_t, size_t> _output_sizes;
+  bool _expected_fail_model_load{false};
+  bool _expected_fail_compile{false};
 };
 
 /**
@@ -141,10 +264,19 @@ protected:
       //      nnfw_load_circle_from_buffer to outside forloop
       NNFW_ENSURE_SUCCESS(nnfw_create_session(&_so.session));
       auto &cbuf = _context->cbuf();
-      NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(_so.session, cbuf.buffer(), cbuf.size()));
+      auto model_load_result =
+          nnfw_load_circle_from_buffer(_so.session, cbuf.buffer(), cbuf.size());
+      if (_context->expected_fail_model_load())
+      {
+        ASSERT_NE(model_load_result, NNFW_STATUS_NO_ERROR);
+        std::cerr << "Failed model loading as expected." << std::endl;
+        NNFW_ENSURE_SUCCESS(nnfw_close_session(_so.session));
+        continue;
+      }
+      NNFW_ENSURE_SUCCESS(model_load_result);
       NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_so.session, backend.data()));
 
-      if (_context->fail_compile())
+      if (_context->expected_fail_compile())
       {
         ASSERT_EQ(nnfw_prepare(_so.session), NNFW_STATUS_ERROR);
 
@@ -162,11 +294,18 @@ protected:
         nnfw_tensorinfo ti;
         NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(_so.session, ind, &ti));
         uint64_t input_elements = num_elems(&ti);
-        _so.inputs[ind].resize(input_elements);
-
-        ASSERT_EQ(nnfw_set_input(_so.session, ind, ti.dtype, _so.inputs[ind].data(),
-                                 sizeof(float) * input_elements),
-                  NNFW_STATUS_NO_ERROR);
+        _so.inputs[ind].resize(input_elements * sizeOfNnfwType(ti.dtype));
+        if (_so.inputs[ind].size() == 0)
+        {
+          // Optional inputs
+          ASSERT_EQ(nnfw_set_input(_so.session, ind, ti.dtype, nullptr, 0), NNFW_STATUS_NO_ERROR);
+        }
+        else
+        {
+          ASSERT_EQ(nnfw_set_input(_so.session, ind, ti.dtype, _so.inputs[ind].data(),
+                                   _so.inputs[ind].size()),
+                    NNFW_STATUS_NO_ERROR);
+        }
       }
 
       uint32_t num_outputs;
@@ -176,10 +315,24 @@ protected:
       {
         nnfw_tensorinfo ti;
         NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_so.session, ind, &ti));
-        uint64_t output_elements = num_elems(&ti);
-        _so.outputs[ind].resize(output_elements);
+
+        auto size = 0;
+        {
+          if (_context->hasOutputSizes(ind))
+          {
+            size = _context->output_sizes(ind);
+          }
+          else
+          {
+            uint64_t output_elements = num_elems(&ti);
+            size = output_elements * sizeOfNnfwType(ti.dtype);
+          }
+          _so.outputs[ind].resize(size);
+        }
+
+        ASSERT_GT(_so.outputs[ind].size(), 0) << "Please make sure TC output is non-empty.";
         ASSERT_EQ(nnfw_set_output(_so.session, ind, ti.dtype, _so.outputs[ind].data(),
-                                  sizeof(float) * output_elements),
+                                  _so.outputs[ind].size()),
                   NNFW_STATUS_NO_ERROR);
       }
 
@@ -193,7 +346,13 @@ protected:
         {
           // Fill the values
           ASSERT_EQ(_so.inputs[i].size(), ref_inputs[i].size());
-          memcpy(_so.inputs[i].data(), ref_inputs[i].data(), _so.inputs[i].size() * sizeof(float));
+          memcpy(_so.inputs[i].data(), ref_inputs[i].data(), ref_inputs[i].size());
+        }
+
+        if (test_case.expect_error_on_run())
+        {
+          ASSERT_EQ(nnfw_run(_so.session), NNFW_STATUS_ERROR);
+          continue;
         }
 
         NNFW_ENSURE_SUCCESS(nnfw_run(_so.session));
@@ -201,12 +360,43 @@ protected:
         ASSERT_EQ(_so.outputs.size(), ref_outputs.size());
         for (uint32_t i = 0; i < _so.outputs.size(); i++)
         {
+          nnfw_tensorinfo ti;
+          NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_so.session, i, &ti));
+
           // Check output tensor values
           auto &ref_output = ref_outputs[i];
           auto &output = _so.outputs[i];
           ASSERT_EQ(output.size(), ref_output.size());
-          for (uint32_t e = 0; e < ref_output.size(); e++)
-            EXPECT_NEAR(ref_output[e], output[e], 0.001); // TODO better way for handling FP error?
+
+          switch (ti.dtype)
+          {
+            case NNFW_TYPE_TENSOR_BOOL:
+              compareBuffersExactBool(ref_output, output, i);
+              break;
+            case NNFW_TYPE_TENSOR_UINT8:
+              compareBuffersExact<uint8_t>(ref_output, output, i);
+              break;
+            case NNFW_TYPE_TENSOR_INT32:
+              compareBuffersExact<int32_t>(ref_output, output, i);
+              break;
+            case NNFW_TYPE_TENSOR_FLOAT32:
+              // TODO better way for handling FP error?
+              for (uint32_t e = 0; e < ref_output.size() / sizeof(float); e++)
+              {
+                float refval = reinterpret_cast<const float *>(ref_output.data())[e];
+                float val = reinterpret_cast<const float *>(output.data())[e];
+                EXPECT_NEAR(refval, val, 0.001) << "Output #" << i << ", Element Index : " << e;
+              }
+              break;
+            case NNFW_TYPE_TENSOR_INT64:
+              compareBuffersExact<int64_t>(ref_output, output, i);
+              break;
+            case NNFW_TYPE_TENSOR_QUANT8_ASYMM:
+              throw std::runtime_error{"NYI : comparison of tensors of QUANT8_ASYMM"};
+            default:
+              throw std::runtime_error{"Invalid tensor type"};
+          }
+          // TODO Add shape comparison
         }
       }
 
@@ -214,7 +404,33 @@ protected:
     }
   }
 
+private:
+  template <typename T>
+  void compareBuffersExact(const std::vector<uint8_t> &ref_buf, const std::vector<uint8_t> &act_buf,
+                           uint32_t index)
+  {
+    for (uint32_t e = 0; e < ref_buf.size() / sizeof(T); e++)
+    {
+      T ref = reinterpret_cast<const T *>(ref_buf.data())[e];
+      T act = reinterpret_cast<const T *>(act_buf.data())[e];
+      EXPECT_EQ(ref, act) << "Output #" << index << ", Element Index : " << e;
+    }
+  }
+
+  void compareBuffersExactBool(const std::vector<uint8_t> &ref_buf,
+                               const std::vector<uint8_t> &act_buf, uint32_t index)
+  {
+    for (uint32_t e = 0; e < ref_buf.size() / sizeof(uint8_t); e++)
+    {
+      uint8_t ref_raw = reinterpret_cast<const uint8_t *>(ref_buf.data())[e];
+      bool ref = (ref_raw != 0 ? true : false);
+      uint8_t act_raw = reinterpret_cast<const uint8_t *>(act_buf.data())[e];
+      bool act = (act_raw != 0 ? true : false);
+      EXPECT_EQ(ref, act) << "Output #" << index << ", Element Index : " << e;
+    }
+  }
+
 protected:
-  SessionObject _so;
+  SessionObjectGeneric _so;
   std::unique_ptr<GenModelTestContext> _context;
 };
diff --git a/tests/nnfw_api/src/GenModelTests.cc b/tests/nnfw_api/src/GenModelTests.cc
new file mode 100644
index 000000000..538da5dfa
--- /dev/null
+++ b/tests/nnfw_api/src/GenModelTests.cc
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file This file contains miscellaneous GenModelTest test cases.
+ *
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, UnusedConstOutputOnly)
+{
+  // A single tensor which is constant
+  CircleGen cgen;
+  uint32_t const_buf = cgen.addBuffer(std::vector<float>{9, 8, 7, 6});
+  int out_const = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32, const_buf});
+  cgen.setInputsAndOutputs({}, {out_const});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({}, {{9, 8, 7, 6}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, UnusedConstOutputAndAdd)
+{
+  // A single tensor which is constant + an Add op
+  CircleGen cgen;
+  uint32_t rhs_buf = cgen.addBuffer(std::vector<float>{5, 4, 7, 4});
+  uint32_t const_buf = cgen.addBuffer(std::vector<float>{9, 8, 7, 6});
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32, rhs_buf});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out_const = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32, const_buf});
+  cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({lhs}, {out, out_const});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{6, 7, 9, 8}, {9, 8, 7, 6}}));
+  _context->addTestCase(uniformTCD<float>({{0, 1, 2, 3}}, {{5, 5, 9, 7}, {9, 8, 7, 6}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, UsedConstOutput)
+{
+  // (( Input 1 )) ---------\
+  //                         |=> [ Add ] -> (( Output 1 ))
+  // (( Const Output 2 )) --<
+  //                         |=> [ Add ] -> (( Output 0 ))
+  // (( Input 0 )) ---------/
+  CircleGen cgen;
+  uint32_t rhs_buf = cgen.addBuffer(std::vector<float>{6, 4, 8, 1});
+  int in0 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in1 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out0 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out1 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int const_out2 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32, rhs_buf});
+  cgen.addOperatorAdd({{in0, const_out2}, {out0}}, circle::ActivationFunctionType_NONE);
+  cgen.addOperatorAdd({{const_out2, in1}, {out1}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in0, in1}, {out0, out1, const_out2});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 1, 1, 1}, {-1, -1, -1, -1}},
+                                          {{7, 5, 9, 2}, {5, 3, 7, 0}, {6, 4, 8, 1}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, TensorBothInputOutput)
+{
+  // A single tensor which is an input and an output at the same time
+  CircleGen cgen;
+  int t = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.setInputsAndOutputs({t}, {t});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{1, 3, 2, 4}}));
+  _context->addTestCase(uniformTCD<float>({{100, 300, 200, 400}}, {{100, 300, 200, 400}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, TensorBothInputOutputCrossed)
+{
+  // Two tensors which are an input and an output at the same time
+  // But the order of inputs and outputs is changed.
+  CircleGen cgen;
+  int t1 = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int t2 = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.setInputsAndOutputs({t1, t2}, {t2, t1});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1}, {2}}, {{2}, {1}}));
+  _context->addTestCase(uniformTCD<float>({{100}, {200}}, {{200}, {100}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneTensor_TwoOutputs)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({lhs, rhs}, {out, out}); // Same tensors are used twice as output
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 1}, {2, 2}}, {{3, 3}, {3, 3}}));
+  _context->addTestCase(uniformTCD<float>({{2, 4}, {7, 4}}, {{9, 8}, {9, 8}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneTensor_ThreeOutputs)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({lhs, rhs}, {out, out, out}); // Same tensors are used 3 times as output
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1}, {2}}, {{3}, {3}, {3}}));
+  _context->addTestCase(uniformTCD<float>({{2}, {7}}, {{9}, {9}, {9}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneTensor_InputAndTwoOutputs)
+{
+  CircleGen cgen;
+  int t = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.setInputsAndOutputs({t}, {t, t}); // Same tensor is an input and 2 outputs
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 1}}, {{1, 1}, {1, 1}}));
+  _context->addTestCase(uniformTCD<float>({{2, 4}}, {{2, 4}, {2, 4}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneTensor_InputAndTwoOutputsUsed)
+{
+  CircleGen cgen;
+  int t = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  int o = cgen.addTensor({{2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorNeg({{t}, {o}});
+  cgen.setInputsAndOutputs({t}, {t, t, o}); // Same tensor is an input and 2 outputs
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 1}}, {{1, 1}, {1, 1}, {-1, -1}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneTensor_ConstAndThreeOutputs)
+{
+  CircleGen cgen;
+  uint32_t const_buf = cgen.addBuffer(std::vector<float>{2, 5});
+  int t = cgen.addTensor({{2}, circle::TensorType_FLOAT32, const_buf});
+  cgen.setInputsAndOutputs({}, {t, t, t}); // A const tensor is 3 outputs
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({}, {{2, 5}, {2, 5}, {2, 5}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/ModelTestDynamicTensor.cc b/tests/nnfw_api/src/ModelTestDynamicTensor.cc
index e2d70d2c0..9a3a1add0 100644
--- a/tests/nnfw_api/src/ModelTestDynamicTensor.cc
+++ b/tests/nnfw_api/src/ModelTestDynamicTensor.cc
@@ -19,8 +19,8 @@
 
 #include "common.h"
 #include "fixtures.h"
-#include "NNPackages.h"
 #include "CircleGen.h"
+#include "GenModelTest.h"
 
 void set_input_output(nnfw_session *session, const std::vector<float> &input,
                       std::vector<float> &actual_output)
@@ -59,151 +59,120 @@ void set_input_output(nnfw_session *session, const std::vector<float> &input0,
  *
  * @note Run this test with "cpu" backend
  */
-// TODO Rewrite this with CircleGen
-class TestDynamicTensorReshapeModelLoaded
-    : public ValidationTestModelLoaded<NNPackages::DYNAMIC_TENSOR_RESHAPE>
+auto build_dynamic_Reshape()
 {
-protected:
-  void set_input_output(const std::vector<int> &new_shape, int actual_output_size,
-                        std::vector<float> *actual_output)
-  {
-    NNFW_STATUS res = nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_INT32, new_shape.data(),
-                                     sizeof(int) * new_shape.size());
-    NNFW_ENSURE_SUCCESS(res);
-
-    res = nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, actual_output->data(),
-                          sizeof(float) * actual_output_size);
-    NNFW_ENSURE_SUCCESS(res);
-  }
-
-  void prepare_and_set_input_output(const std::vector<int> &new_shape, int actual_output_size,
-                                    std::vector<float> *actual_output)
-  {
-    NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
+  CircleGen cgen;
 
-    NNFW_STATUS res = NNFW_STATUS_ERROR;
+  auto f32 = circle::TensorType::TensorType_FLOAT32;
+  auto i32 = circle::TensorType::TensorType_INT32;
 
-    res = nnfw_prepare(_session);
-    NNFW_ENSURE_SUCCESS(res);
+  std::vector<float> new_shape_data{-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
+  uint32_t input_buf = cgen.addBuffer(new_shape_data); // shape = [2, 3]
+  int input = cgen.addTensor({{2, 3}, f32, input_buf});
+  int new_shape = cgen.addTensor({{2}, i32});
+  int out = cgen.addTensor({{}, f32}); // scalar, meaning output shape is unspecified
 
-    set_input_output(new_shape, actual_output_size, actual_output);
-    // real test case should start from calling nnfw_run()
-  }
+  CircleGen::Shape empty_new_shape;
+  cgen.addOperatorReshape({{input, new_shape}, {out}}, empty_new_shape);
+  cgen.setInputsAndOutputs({new_shape}, {out});
+  auto cbuf = cgen.finish();
+  return cbuf;
+}
 
-  // call this after calling nnfw_prepare()
-  void set_input_output_and_run(const std::vector<int> &new_shape,
-                                const std::vector<float> &expected_output, bool no_run_error = true)
-  {
-    int output_element_num = expected_output.size();
-    std::vector<float> actual_output(output_element_num);
-
-    set_input_output(new_shape, output_element_num, &actual_output);
-
-    // Do inference
-    NNFW_STATUS res = nnfw_run(_session);
-
-    if (no_run_error)
-    {
-      NNFW_ENSURE_SUCCESS(res);
-
-      // output shape check
-      nnfw_tensorinfo info;
-      NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &info));
-      ASSERT_EQ(info.rank, new_shape.size());
-      for (uint32_t d = 0; d < info.rank; ++d)
-        ASSERT_EQ(info.dims[d], new_shape[d]);
-
-      // output value check
-      for (int i = 0; i < expected_output.size(); ++i)
-        ASSERT_EQ(expected_output[i], actual_output[i]);
-    }
-    else
-    {
-      ASSERT_EQ(res, NNFW_STATUS_ERROR);
-    }
-  };
+TEST_F(GenModelTest, dynamic_reshape_from_2x3_to_3x2)
+{
+  const std::vector<int> new_shape{3, 2};
+  const std::vector<float> expected{-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
 
-  void TearDown() override
+  _context = std::make_unique<GenModelTestContext>(build_dynamic_Reshape());
   {
-    ValidationTestModelLoaded<NNPackages::DYNAMIC_TENSOR_RESHAPE>::TearDown();
+    TestCaseData tcd;
+    tcd.addInput(new_shape);
+    tcd.addOutput(expected);
+    _context->addTestCase(tcd);
+    _context->setBackends({"cpu"}); // Currently, dynamic tensor runs on "cpu" only
+    _context->output_sizes(0, sizeof(float) * expected.size());
   }
-};
-
-TEST_F(TestDynamicTensorReshapeModelLoaded, reshape_to_3x2)
-{
-  const std::vector<int> new_shape = {3, 2};
-  const std::vector<float> expected = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
-  std::vector<float> actual_output(expected.size());
-
-  prepare_and_set_input_output(new_shape, expected.size(), &actual_output);
-
-  // Do inference
-  NNFW_STATUS res = nnfw_run(_session);
-  NNFW_ENSURE_SUCCESS(res);
-
-  // output value check
-  for (int i = 0; i < expected.size(); ++i)
-    ASSERT_EQ(expected[i], actual_output[i]);
+  // GenModelTest::teardown() will do the rest
+  SUCCEED();
 }
 
 /**
  * @brief Negative test.
  *        Reshape's first input has 6 values but trying to reshaping to [3, 3]
  */
-TEST_F(TestDynamicTensorReshapeModelLoaded, neg_reshape_to_wrong_3x3)
+TEST_F(GenModelTest, neg_reshape_from_2x3_to_wrong_3x3)
 {
-  const std::vector<int> wrong_shape = {3, 3}; // wrong shape input
-  const int actual_element_num = 9;            // whatever number
-  std::vector<float> actual_output(9);         // whatever size
+  const std::vector<int> wrong_shape{3, 3}; // wrong shape input
+  const std::vector<float> expected{0};     // whatever
 
-  prepare_and_set_input_output(wrong_shape, actual_element_num, &actual_output);
-
-  // Do inference
-  NNFW_STATUS res = nnfw_run(_session);
-  ASSERT_EQ(res, NNFW_STATUS_ERROR); // run should fail
+  _context = std::make_unique<GenModelTestContext>(build_dynamic_Reshape());
+  {
+    TestCaseData tcd;
+    tcd.addInput(wrong_shape);
+    tcd.addOutput(expected);
+    tcd.expect_error_on_run(true);
+
+    _context->addTestCase(tcd);
+    _context->setBackends({"cpu"}); // Currently, dynamic tensor runs on "cpu" only
+    _context->output_sizes(0, sizeof(float) * expected.size());
+  }
+  // GenModelTest::teardown() will do the rest
+  SUCCEED();
 }
 
-TEST_F(TestDynamicTensorReshapeModelLoaded, reshape_multiple_executions)
+TEST_F(GenModelTest, reshape_multiple_executions)
 {
-  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
-
-  NNFW_STATUS res = nnfw_prepare(_session);
-  NNFW_ENSURE_SUCCESS(res);
-
   std::vector<int> new_shape;
   std::vector<float> expected = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
 
-  // let's call multiple times
-  new_shape = {3, 2};
-  set_input_output_and_run(new_shape, expected);
+  auto add_tcd = [&](const decltype(new_shape) &&new_shape) {
+    TestCaseData tcd;
+    tcd.addInput(new_shape);
+    tcd.addOutput(expected);
+    _context->addTestCase(tcd);
+  };
 
-  new_shape = {1, 6};
-  set_input_output_and_run(new_shape, expected);
+  _context = std::make_unique<GenModelTestContext>(build_dynamic_Reshape());
+  {
+    add_tcd({3, 2});
+    add_tcd({1, 6});
+    add_tcd({6, 1});
 
-  new_shape = {6, 1};
-  set_input_output_and_run(new_shape, expected);
+    _context->setBackends({"cpu"}); // Currently, dynamic tensor runs on "cpu" only
+    _context->output_sizes(0, sizeof(float) * expected.size());
+  }
+  // GenModelTest::teardown() will do the rest
+  SUCCEED();
 }
 
-TEST_F(TestDynamicTensorReshapeModelLoaded, neg_reshape_multiple_executions)
+TEST_F(GenModelTest, neg_reshape_multiple_executions)
 {
-  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
-
-  NNFW_STATUS res = nnfw_prepare(_session);
-  NNFW_ENSURE_SUCCESS(res);
-
   std::vector<int> new_shape;
   std::vector<float> expected = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
 
-  // let's call multiple times including the second nnfw_run() to fail
-  new_shape = {3, 2};
-  set_input_output_and_run(new_shape, expected);
+  auto add_tcd = [&](const decltype(new_shape) &&new_shape, bool expect_error_on_run) {
+    TestCaseData tcd;
+    tcd.addInput(new_shape);
+    tcd.addOutput(expected);
+    tcd.expect_error_on_run(expect_error_on_run);
+    _context->addTestCase(tcd);
+  };
 
-  new_shape = {1, 100};                                 // wrong shape
-  set_input_output_and_run(new_shape, expected, false); // Run will fail
+  _context = std::make_unique<GenModelTestContext>(build_dynamic_Reshape());
+  {
+    bool EXPECT_ERROR_ON_RUN = true;
+    bool EXPECT_SUCCESS_ON_RUN = !EXPECT_ERROR_ON_RUN;
 
-  // next run should succeed
-  new_shape = {6, 1};
-  set_input_output_and_run(new_shape, expected);
+    add_tcd({3, 2}, EXPECT_SUCCESS_ON_RUN);
+    add_tcd({1, 100}, EXPECT_ERROR_ON_RUN); // 1th tcd. wrong shape
+    add_tcd({6, 1}, EXPECT_SUCCESS_ON_RUN);
+
+    _context->setBackends({"cpu"}); // Currently, dynamic tensor runs on "cpu" only
+    _context->output_sizes(0, sizeof(float) * expected.size());
+  }
+  // GenModelTest::teardown() will do the rest
+  SUCCEED();
 }
 
 //
diff --git a/tests/nnfw_api/src/ModelTestInputReshaping.cc b/tests/nnfw_api/src/ModelTestInputReshaping.cc
index bfe347fe7..f5ce3e062 100644
--- a/tests/nnfw_api/src/ModelTestInputReshaping.cc
+++ b/tests/nnfw_api/src/ModelTestInputReshaping.cc
@@ -18,25 +18,37 @@
 #include <nnfw_internal.h>
 
 #include "fixtures.h"
-#include "NNPackages.h"
 #include "common.h"
-
-using TestInputReshapingAddModelLoaded = ValidationTestModelLoaded<NNPackages::INPUT_RESHAPING_ADD>;
+#include "CircleGen.h"
 
 /**
  * @brief Testing the following model:
  *       #1 = placeholder (shape = [2, 2], dtype=float)
  *       #2 = placeholder (shape = [2], dtype=float)
  *       #3 = add(#1, #2)
- *
- * @note Run this test with "cpu" backend and "linear" executor
  */
-TEST_F(TestInputReshapingAddModelLoaded, reshaping_2x2_to_4x2)
+auto build_model_add_input_reshaping()
+{
+  // Model is not important
+  CircleGen cgen;
+  auto f32 = circle::TensorType::TensorType_FLOAT32;
+  int in1 = cgen.addTensor({{2, 2}, f32}); // consider this [None, None]
+  int in2 = cgen.addTensor({{2}, f32});
+  int out = cgen.addTensor({{}, f32}); // scalar, meaning output shape is unspecified
+  cgen.addOperatorAdd({{in1, in2}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in1, in2}, {out});
+  auto cbuf = cgen.finish();
+  return cbuf;
+}
+
+TEST(TestDynamicTensor, input_reshaping)
 {
-  NNFW_STATUS res = NNFW_STATUS_ERROR;
+  nnfw_session *session = nullptr;
+  NNFW_ENSURE_SUCCESS(nnfw_create_session(&session));
+  const auto model_buf = build_model_add_input_reshaping();
+  NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(session, model_buf.buffer(), model_buf.size()));
 
-  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
-  NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "EXECUTOR", "Linear"));
+  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(session, "cpu"));
 
   // input and output values
   const std::vector<float> input1 = {0, 1, 2, 3, 4, 5, 6, 7}; // of changed shape [4, 2]
@@ -53,36 +65,31 @@ TEST_F(TestInputReshapingAddModelLoaded, reshaping_2x2_to_4x2)
 
   // input reshaping from [2, 2] to [4, 2]
   nnfw_tensorinfo ti = {NNFW_TYPE_TENSOR_FLOAT32, 2, {4, 2}};
-  res = nnfw_set_input_tensorinfo(_session, 0, &ti);
+  NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(session, 0, &ti));
 
-  res = nnfw_prepare(_session);
-  NNFW_ENSURE_SUCCESS(res);
+  NNFW_ENSURE_SUCCESS(nnfw_prepare(session));
 
   nnfw_tensorinfo ti_input = {}; // Static inference result will be stored
-  nnfw_input_tensorinfo(_session, 0, &ti_input);
+  NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(session, 0, &ti_input));
   ASSERT_TRUE(tensorInfoEqual(ti, ti_input));
 
   nnfw_tensorinfo ti_output = {}; // Static inference result will be stored
-  nnfw_output_tensorinfo(_session, 0, &ti_output);
+  NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(session, 0, &ti_output));
   ASSERT_TRUE(tensorInfoEqual(ti, ti_output)); // input/output shapes are same with for this model
 
-  res = nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, input1.data(),
-                       sizeof(float) * input1.size());
-  NNFW_ENSURE_SUCCESS(res);
-  res = nnfw_set_input(_session, 1, NNFW_TYPE_TENSOR_FLOAT32, input2.data(),
-                       sizeof(float) * input2.size());
-  NNFW_ENSURE_SUCCESS(res);
+  NNFW_ENSURE_SUCCESS(nnfw_set_input(session, 0, NNFW_TYPE_TENSOR_FLOAT32, input1.data(),
+                                     sizeof(float) * input1.size()));
+  NNFW_ENSURE_SUCCESS(nnfw_set_input(session, 1, NNFW_TYPE_TENSOR_FLOAT32, input2.data(),
+                                     sizeof(float) * input2.size()));
 
   uint64_t output_num_elements = tensorInfoNumElements(ti_output);
   ASSERT_EQ(output_num_elements, expected.size());
   std::vector<float> actual_output(output_num_elements);
-  res = nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, actual_output.data(),
-                        sizeof(float) * actual_output.size());
-  NNFW_ENSURE_SUCCESS(res);
+  NNFW_ENSURE_SUCCESS(nnfw_set_output(session, 0, NNFW_TYPE_TENSOR_FLOAT32, actual_output.data(),
+                                      sizeof(float) * actual_output.size()));
 
   // Do inference
-  res = nnfw_run(_session);
-  NNFW_ENSURE_SUCCESS(res);
+  NNFW_ENSURE_SUCCESS(nnfw_run(session));
 
   // compare
   for (int i = 0; i < expected.size(); ++i)
diff --git a/tests/nnfw_api/src/NNPackages.cc b/tests/nnfw_api/src/NNPackages.cc
index 534973cb0..b0febb7a1 100644
--- a/tests/nnfw_api/src/NNPackages.cc
+++ b/tests/nnfw_api/src/NNPackages.cc
@@ -29,7 +29,7 @@ const char *TEST_PACKAGE_NAMES[] = {
     "add", "add_no_manifest", "add_invalid_manifest",
 
     // for dynamic tensor test
-    "input_reshaping_add", "dynamic_tensor_reshape", "while_dynamic", "if_dynamic",
+    "while_dynamic", "if_dynamic",
 };
 
 NNPackages &NNPackages::get()
@@ -71,7 +71,10 @@ void NNPackages::checkAll()
     DIR *dir = opendir(path.c_str());
     if (!dir)
     {
-      std::string msg = "missing nnpackage: " + package_name + ", path: " + path;
+      std::string msg =
+          "missing nnpackage: " + package_name + ", path: " + path +
+          "\nPlease run \'[install_dir]/test/onert-test prepare-model --nnpackage\' to "
+          "download nnpackage";
       throw std::runtime_error{msg};
     }
     closedir(dir);
diff --git a/tests/nnfw_api/src/NNPackages.h b/tests/nnfw_api/src/NNPackages.h
index 735fa96a0..a51b7701d 100644
--- a/tests/nnfw_api/src/NNPackages.h
+++ b/tests/nnfw_api/src/NNPackages.h
@@ -43,8 +43,6 @@ public:
     ADD_INVALID_MANIFEST, //< Contains "Add" model but the manifest file is broken JSON
 
     // for dynamic tensor test
-    INPUT_RESHAPING_ADD,
-    DYNAMIC_TENSOR_RESHAPE,
     WHILE_DYNAMIC,
     IF_DYNAMIC,
 
diff --git a/tests/nnfw_api/src/RegressionTests.cc b/tests/nnfw_api/src/RegressionTests.cc
index 05914b839..10d6e5d6e 100644
--- a/tests/nnfw_api/src/RegressionTests.cc
+++ b/tests/nnfw_api/src/RegressionTests.cc
@@ -62,3 +62,117 @@ TEST_F(RegressionTest, neg_github_3826)
   ASSERT_EQ(nnfw_prepare(session), NNFW_STATUS_ERROR);
   NNFW_ENSURE_SUCCESS(nnfw_close_session(session));
 }
+
+TEST_F(RegressionTest, github_11748)
+{
+  // At the 1st call, input tensor is static. From the 2nd call, input tensor becomes dynamic.
+  // the following model and calling sequence were what nnstreamer people used for their test case.
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+
+  std::vector<float> rhs_data{2};
+  uint32_t rhs_buf = cgen.addBuffer(rhs_data);
+  int rhs = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, rhs_buf});
+
+  int out = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({lhs}, {out});
+  auto cbuf = cgen.finish();
+
+  nnfw_session *session = nullptr;
+  NNFW_ENSURE_SUCCESS(nnfw_create_session(&session));
+  NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(session, cbuf.buffer(), cbuf.size()));
+  // To test when there is no backends loaded for the session
+  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(session, "cpu"));
+  NNFW_ENSURE_SUCCESS(nnfw_prepare(session));
+
+  uint32_t input_num = -1;
+  NNFW_ENSURE_SUCCESS(nnfw_input_size(session, &input_num));
+
+  nnfw_tensorinfo t_input;
+  NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(session, 0, &t_input));
+
+  uint32_t output_num = -1;
+  NNFW_ENSURE_SUCCESS(nnfw_output_size(session, &output_num));
+
+  nnfw_tensorinfo t_output;
+  NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(session, 0, &t_output));
+
+  // when new_dim == 1, input tensor is static. From 2, input tensor becomes dynamic.
+  for (int32_t new_dim = 1; new_dim <= 4; new_dim++)
+  {
+    nnfw_tensorinfo t_new_input;
+    t_new_input.dtype = t_input.dtype;
+    t_new_input.rank = 1;
+    t_new_input.dims[0] = new_dim;
+    NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(session, 0, &t_new_input));
+
+    NNFW_ENSURE_SUCCESS(nnfw_input_size(session, &input_num));
+    NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(session, 0, &t_input));
+
+    ASSERT_EQ(input_num, 1);
+    ASSERT_EQ(t_input.rank, t_new_input.rank);
+    ASSERT_EQ(t_input.dims[0], new_dim);
+
+    uint8_t input_buf[new_dim * sizeof(float)];
+    NNFW_ENSURE_SUCCESS(
+        nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float)));
+
+    uint8_t output_buf[new_dim * sizeof(float)];
+    NNFW_ENSURE_SUCCESS(
+        nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float)));
+
+    NNFW_ENSURE_SUCCESS(nnfw_run(session));
+
+    NNFW_ENSURE_SUCCESS(nnfw_output_size(session, &output_num));
+    NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(session, 0, &t_output));
+
+    ASSERT_EQ(output_num, 1);
+    ASSERT_EQ(t_output.rank, t_new_input.rank);
+    ASSERT_EQ(t_output.dims[0], new_dim);
+
+    // seems weird calling but anyway nnstreamer people case calls this again.
+    // Anyways, runtime should work
+    NNFW_ENSURE_SUCCESS(
+        nnfw_set_input(session, 0, t_input.dtype, &input_buf, new_dim * sizeof(float)));
+    NNFW_ENSURE_SUCCESS(
+        nnfw_set_output(session, 0, t_output.dtype, &output_buf, new_dim * sizeof(float)));
+    NNFW_ENSURE_SUCCESS(nnfw_run(session));
+  }
+
+  NNFW_ENSURE_SUCCESS(nnfw_close_session(session));
+}
+
+TEST_F(RegressionTest, github_4585)
+{
+  // A single tensor which is an input and an output at the same time
+  CircleGen cgen;
+  int t = cgen.addTensor({{1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.setInputsAndOutputs({t}, {t});
+  auto cbuf = cgen.finish();
+
+  nnfw_session *session = nullptr;
+  NNFW_ENSURE_SUCCESS(nnfw_create_session(&session));
+  NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(session, cbuf.buffer(), cbuf.size()));
+  // To test when there is no backends loaded for the session
+  NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(session, "cpu"));
+  NNFW_ENSURE_SUCCESS(nnfw_prepare(session));
+
+  // Change input tensorinfo (Make dynamic shape inference happen)
+  nnfw_tensorinfo ti_new = {NNFW_TYPE_TENSOR_FLOAT32, 2, {1, 2}};
+  NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(session, 0, &ti_new));
+
+  std::vector<float> in_buf{1, 1};
+  std::vector<float> out_buf{-1, -1};
+
+  NNFW_ENSURE_SUCCESS(
+      nnfw_set_input(session, 0, ti_new.dtype, in_buf.data(), in_buf.size() * sizeof(float)));
+  NNFW_ENSURE_SUCCESS(
+      nnfw_set_output(session, 0, ti_new.dtype, out_buf.data(), out_buf.size() * sizeof(float)));
+
+  NNFW_ENSURE_SUCCESS(nnfw_run(session));
+
+  ASSERT_EQ(in_buf, out_buf);
+
+  NNFW_ENSURE_SUCCESS(nnfw_close_session(session));
+}
diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
index f19bb782c..e2ae655be 100644
--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
+++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
@@ -39,6 +39,28 @@ TEST_F(ValidationTestAddSessionPrepared, run_twice)
   ASSERT_FLOAT_EQ(_output[0], 7.0);
 }
 
+TEST_F(ValidationTestAddSessionPrepared, run_many_times_dynamic_input)
+{
+  for (int v = 1; v <= 5; v++) // 5 times with different shapes
+  {
+    nnfw_tensorinfo ti_input = {NNFW_TYPE_TENSOR_FLOAT32, 4, {1, 1, 1, v}};
+    SetInOutBuffersDynamic(&ti_input);
+
+    for (int i = 0; i < v; i++)
+      _input[i] = i * 10.0;
+
+    NNFW_ENSURE_SUCCESS(nnfw_run(_session));
+
+    // Check if the shape inference is correct
+    nnfw_tensorinfo ti_output;
+    ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &ti_output), NNFW_STATUS_NO_ERROR);
+    EXPECT_EQ(num_elems(&ti_input), num_elems(&ti_output));
+
+    for (int i = 0; i < v; i++)
+      ASSERT_FLOAT_EQ(_output[i], i * 10.0 + 2.0) << "i : " << i;
+  }
+}
+
 TEST_F(ValidationTestAddSessionPrepared, run_async)
 {
   SetInOutBuffers();
@@ -161,4 +183,12 @@ TEST_F(ValidationTestAddSessionPrepared, neg_prepare)
   ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
 }
 
+TEST_F(ValidationTestAddSessionPrepared, neg_run_without_set_output)
+{
+  uint8_t input[4];
+  NNFW_ENSURE_SUCCESS(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, input, sizeof(input)));
+  // `nnfw_set_output()` is not called
+  ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+}
+
 // TODO Validation check when "nnfw_run" is called without input & output tensor setting
diff --git a/tests/nnfw_api/src/fixtures.h b/tests/nnfw_api/src/fixtures.h
index f273d6553..0cb67b5e2 100644
--- a/tests/nnfw_api/src/fixtures.h
+++ b/tests/nnfw_api/src/fixtures.h
@@ -126,6 +126,21 @@ protected:
               NNFW_STATUS_NO_ERROR);
   }
 
+  void SetInOutBuffersDynamic(const nnfw_tensorinfo *ti_input)
+  {
+    NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, ti_input));
+    uint64_t input_elements = num_elems(ti_input);
+    _input.resize(input_elements);
+    ASSERT_EQ(
+        nnfw_set_input(_session, 0, ti_input->dtype, _input.data(), sizeof(float) * input_elements),
+        NNFW_STATUS_NO_ERROR);
+
+    _output.resize(40000); // Give sufficient size for the output
+    ASSERT_EQ(nnfw_set_output(_session, 0, ti_input->dtype, _output.data(),
+                              sizeof(float) * _output.size()),
+              NNFW_STATUS_NO_ERROR);
+  }
+
 protected:
   std::vector<float> _input;
   std::vector<float> _output;
diff --git a/tests/nnfw_api/src/one_op_tests/Add.cc b/tests/nnfw_api/src/one_op_tests/Add.cc
index 281d5ded5..1fff0ed30 100644
--- a/tests/nnfw_api/src/one_op_tests/Add.cc
+++ b/tests/nnfw_api/src/one_op_tests/Add.cc
@@ -30,8 +30,8 @@ TEST_F(GenModelTest, OneOp_Add_VarToConst)
   cgen.setInputsAndOutputs({lhs}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{1, 3, 2, 4}}, {{6, 7, 9, 8}}});
-  _context->addTestCase({{{0, 1, 2, 3}}, {{5, 5, 9, 7}}});
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{6, 7, 9, 8}}));
+  _context->addTestCase(uniformTCD<float>({{0, 1, 2, 3}}, {{5, 5, 9, 7}}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
@@ -47,7 +47,22 @@ TEST_F(GenModelTest, OneOp_Add_VarToVar)
   cgen.setInputsAndOutputs({lhs, rhs}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{1, 3, 2, 4}, {5, 4, 7, 4}}, {{6, 7, 9, 8}}});
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}, {5, 4, 7, 4}}, {{6, 7, 9, 8}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Add_VarToVarSame)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{in, in}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{2, 6, 4, 8}}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
@@ -64,7 +79,7 @@ TEST_F(GenModelTest, neg_OneOp_Add_InvalidShape)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -82,7 +97,7 @@ TEST_F(GenModelTest, neg_OneOp_Add_InvalidShapeConst)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -97,7 +112,55 @@ TEST_F(GenModelTest, neg_OneOp_Add_OneOperand)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Add_ThreeOperands)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{in, in, in}, {out}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Add_NoOutput)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{in}, {}}, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Add_InvalidActivation)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{lhs, rhs}, {out}},
+                      static_cast<circle::ActivationFunctionType>(128) /* Invalid value*/);
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}, {5, 4, 7, 4}}, {{6, 7, 9, 8}}));
+  _context->setBackends({"cpu"});
+  _context->expectFailModelLoad();
 
   SUCCEED();
 }
diff --git a/tests/nnfw_api/src/one_op_tests/ArgMax.cc b/tests/nnfw_api/src/one_op_tests/ArgMax.cc
new file mode 100644
index 000000000..2876d8d70
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/ArgMax.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_ArgMax_AxisToConst)
+{
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT32;
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, output_type});
+  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 4, 2, 3});
+  tcd.addOutput(std::vector<int32_t>{1, 0});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_ArgMax_Int64_AxisToConst)
+{
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT64;
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, output_type});
+  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 4, 2, 3});
+  tcd.addOutput(std::vector<int64_t>{1, 0});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_ArgMax_AxisToVar)
+{
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT32;
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, output_type});
+  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in, axis}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 4, 2, 3});
+  tcd.addInput(std::vector<int32_t>{-3});
+  tcd.addOutput(std::vector<int32_t>{1, 0});
+  _context->addTestCase(tcd);
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis0)
+{
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT32;
+  std::vector<int32_t> axis_data{4};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, output_type});
+  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis1)
+{
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT32;
+  std::vector<int32_t> axis_data{-3};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int in = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2}, output_type});
+  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
index 854517e47..8ba82083f 100644
--- a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
@@ -26,7 +26,7 @@ TEST_F(GenModelTest, OneOp_AvgPool2D)
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{1, 3, 2, 4}}, {{2.5}}});
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{2.5}}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
@@ -43,7 +43,82 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidPaddingType)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAveragePool2D({{in}, {out}}, static_cast<circle::Padding>(99), 2, 2, 2, 2,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_1)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, -1, 2,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_2)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, 2, 0,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidStrides_1)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 0, 2, 2, 2,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidStrides_2)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 1, -100, 2, 2,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
 
   SUCCEED();
 }
diff --git a/tests/nnfw_api/src/one_op_tests/Cast.cc b/tests/nnfw_api/src/one_op_tests/Cast.cc
new file mode 100644
index 000000000..71d98ee59
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Cast.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_Cast_Int32ToFloat32)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorCast({{in}, {out}}, circle::TensorType::TensorType_INT32,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1, 2, 3, 4});
+  tcd.addOutput(std::vector<float>{1, 2, 3, 4});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Cast_Float32ToInt32)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  cgen.addOperatorCast({{in}, {out}}, circle::TensorType::TensorType_FLOAT32,
+                       circle::TensorType::TensorType_INT32);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 2, 3, 4});
+  tcd.addOutput(std::vector<int32_t>{1, 2, 3, 4});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Cast_BoolToFloat32)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_BOOL});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorCast({{in}, {out}}, circle::TensorType::TensorType_BOOL,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<bool>{true, false, true, true});
+  tcd.addOutput(std::vector<float>{1, 0, 1, 1});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Cast_AfterEqual)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int equal_out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_BOOL});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorEqual({{lhs, rhs}, {equal_out}});
+  cgen.addOperatorCast({{equal_out}, {out}}, circle::TensorType::TensorType_BOOL,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}, {2, 3, 1, 4}}, {{0, 1, 0, 1}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Cast_InvalidInputCount0)
+{
+  CircleGen cgen;
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  cgen.addOperatorCast({{}, {out}}, circle::TensorType::TensorType_FLOAT32,
+                       circle::TensorType::TensorType_INT32);
+  cgen.setInputsAndOutputs({}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Cast_InvalidInputCount2)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  int out = cgen.addTensor({{1, 2, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorCast({{lhs, rhs}, {out}}, circle::TensorType::TensorType_INT32,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Cast_InvalidOutputCount0)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  cgen.addOperatorCast({{in}, {}}, circle::TensorType::TensorType_INT32,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({in}, {});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Cast_InvalidOutputCount2)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  int out1 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out2 = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  cgen.addOperatorCast({{in}, {out1, out2}}, circle::TensorType::TensorType_INT32,
+                       circle::TensorType::TensorType_FLOAT32);
+  cgen.setInputsAndOutputs({in}, {out1, out2});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Concat.cc b/tests/nnfw_api/src/one_op_tests/Concat.cc
new file mode 100644
index 000000000..2ef1185d4
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Concat.cc
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_Concat_ShareSubTensor)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int shared_subtensor = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int concat_out = cgen.addTensor({{1, 2, 2, 2}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> padding_data{0, 0, 1, 1, 1, 1, 0, 0};
+  uint32_t padding_buf = cgen.addBuffer(padding_data);
+  int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
+  int pad_out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorAdd({{lhs, rhs}, {shared_subtensor}}, circle::ActivationFunctionType_NONE);
+  cgen.addOperatorConcatenation({{rhs, shared_subtensor}, {concat_out}}, 3,
+                                circle::ActivationFunctionType_NONE);
+  cgen.addOperatorPad({{shared_subtensor, padding}, {pad_out}});
+  cgen.setInputsAndOutputs({lhs, rhs}, {pad_out, concat_out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {{1, 3, 2, 4}, {5, 4, 7, 4}},
+      {{0, 0, 0, 0, 0, 6, 7, 0, 0, 9, 8, 0, 0, 0, 0, 0}, {5, 6, 4, 7, 7, 9, 4, 8}}));
+  _context->setBackends({"acl_cl", "acl_neon"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Concat)
+{
+  CircleGen cgen;
+
+  int input1 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int input2 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int output = cgen.addTensor({{4, 3}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorConcatenation({{input1, input2}, {output}}, 0,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({input1, input2}, {output});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}},
+                                          {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}));
+  _context->setBackends({"cpu;acl_cl"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Concat_Subtensor_4D)
+{
+  CircleGen cgen;
+  int in1 = cgen.addTensor({{1, 1, 1, 20}, circle::TensorType::TensorType_FLOAT32});
+  int in2 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> axis_data{3};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+
+  int s_out1 = cgen.addTensor({{1, 1, 1, 5}, circle::TensorType::TensorType_FLOAT32});
+  int s_out2 = cgen.addTensor({{1, 1, 1, 5}, circle::TensorType::TensorType_FLOAT32});
+  int s_out3 = cgen.addTensor({{1, 1, 1, 5}, circle::TensorType::TensorType_FLOAT32});
+  int s_out4 = cgen.addTensor({{1, 1, 1, 5}, circle::TensorType::TensorType_FLOAT32});
+
+  int c_out1 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+  int c_out2 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+  int c_out3 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+
+  int a_out1 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+  int a_out2 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+  int a_out3 = cgen.addTensor({{1, 1, 1, 10}, circle::TensorType::TensorType_FLOAT32});
+
+  int final_out = cgen.addTensor({{1, 1, 1, 35}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorSplit({{axis, in1}, {s_out1, s_out2, s_out3, s_out4}}, 4);
+
+  cgen.addOperatorConcatenation({{s_out1, s_out2}, {c_out1}}, 3,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorConcatenation({{s_out1, s_out3}, {c_out2}}, 3,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorConcatenation({{s_out1, s_out4}, {c_out3}}, 3,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.addOperatorAdd({{c_out1, in2}, {a_out1}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorAdd({{c_out2, in2}, {a_out2}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorAdd({{c_out3, in2}, {a_out3}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.addOperatorConcatenation({{s_out1, a_out1, a_out2, a_out3}, {final_out}}, 3,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.setInputsAndOutputs({in1, in2}, {s_out1, s_out2, s_out3, s_out4, c_out1, c_out2, c_out3,
+                                        a_out1, a_out2, a_out3, final_out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {
+          // inputs
+          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, // in1
+          {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}                                           // in2
+      },
+      {
+          // outputs
+          {1, 2, 3, 4, 5},                     // s_out1
+          {6, 7, 8, 9, 10},                    // s_out2
+          {11, 12, 13, 14, 15},                // s_out3
+          {16, 17, 18, 19, 20},                // s_out4
+          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},     // c_out1
+          {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // c_out2
+          {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // c_out3
+          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},     // a_out1
+          {1, 2, 3, 4, 5, 11, 12, 13, 14, 15}, // a_out2
+          {1, 2, 3, 4, 5, 16, 17, 18, 19, 20}, // a_out3
+          {1, 2, 3,  4,  5,  1,  2,  3, 4, 5, 6, 7, 8,  9,  10, 1,  2, 3,
+           4, 5, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20} // final_out
+      }));
+  _context->setBackends({"acl_cl"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Concat_Subtensor_2D)
+{
+  CircleGen cgen;
+  int in1 = cgen.addTensor({{1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int in2 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+
+  int s_out1 = cgen.addTensor({{1, 1}, circle::TensorType::TensorType_FLOAT32});
+  int s_out2 = cgen.addTensor({{1, 1}, circle::TensorType::TensorType_FLOAT32});
+  int s_out3 = cgen.addTensor({{1, 1}, circle::TensorType::TensorType_FLOAT32});
+  int s_out4 = cgen.addTensor({{1, 1}, circle::TensorType::TensorType_FLOAT32});
+
+  int c_out1 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+  int c_out2 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+  int c_out3 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+
+  int a_out1 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+  int a_out2 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+  int a_out3 = cgen.addTensor({{1, 2}, circle::TensorType::TensorType_FLOAT32});
+
+  int final_out = cgen.addTensor({{1, 7}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorSplit({{axis, in1}, {s_out1, s_out2, s_out3, s_out4}}, 4);
+
+  cgen.addOperatorConcatenation({{s_out1, s_out2}, {c_out1}}, 1,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorConcatenation({{s_out1, s_out3}, {c_out2}}, 1,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorConcatenation({{s_out1, s_out4}, {c_out3}}, 1,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.addOperatorAdd({{c_out1, in2}, {a_out1}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorAdd({{c_out2, in2}, {a_out2}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+  cgen.addOperatorAdd({{c_out3, in2}, {a_out3}},
+                      circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.addOperatorConcatenation({{s_out1, a_out1, a_out2, a_out3}, {final_out}}, 1,
+                                circle::ActivationFunctionType::ActivationFunctionType_NONE);
+
+  cgen.setInputsAndOutputs({in1, in2}, {s_out1, s_out2, s_out3, s_out4, c_out1, c_out2, c_out3,
+                                        a_out1, a_out2, a_out3, final_out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {
+          // inputs
+          {1, 2, 3, 4}, // in1
+          {0, 0}        // in2
+      },
+      {
+          // outputs
+          {1},                  // s_out1
+          {2},                  // s_out2
+          {3},                  // s_out3
+          {4},                  // s_out4
+          {1, 2},               // c_out1
+          {1, 3},               // c_out2
+          {1, 4},               // c_out3
+          {1, 2},               // a_out1
+          {1, 3},               // a_out2
+          {1, 4},               // a_out3
+          {1, 1, 2, 1, 3, 1, 4} // final_out
+      }));
+  _context->setBackends({"acl_cl"});
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Cos.cc b/tests/nnfw_api/src/one_op_tests/Cos.cc
index 72bfe3e2f..03944746a 100644
--- a/tests/nnfw_api/src/one_op_tests/Cos.cc
+++ b/tests/nnfw_api/src/one_op_tests/Cos.cc
@@ -26,7 +26,7 @@ TEST_F(GenModelTest, OneOp_Cos)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   const float pi = 3.141592653589793;
-  _context->addTestCase({{{0, pi / 2, pi, 7}}, {{1, 0, -1, 0.75390225434}}});
+  _context->addTestCase(uniformTCD<float>({{0, pi / 2, pi, 7}}, {{1, 0, -1, 0.75390225434}}));
   _context->setBackends({"cpu"});
 
   SUCCEED();
@@ -44,7 +44,7 @@ TEST_F(GenModelTest, neg_OneOp_Cos_TwoOperand)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"cpu"});
-  _context->setCompileFail();
+  _context->expectFailModelLoad();
 
   SUCCEED();
 }
diff --git a/tests/nnfw_api/src/one_op_tests/Equal.cc b/tests/nnfw_api/src/one_op_tests/Equal.cc
new file mode 100644
index 000000000..54dcbee12
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Equal.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_Equal)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_BOOL});
+  cgen.addOperatorEqual({{lhs, rhs}, {out}});
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{0.1, 0.3, 0.5, 0.7});
+  tcd.addInput(std::vector<float>{0.1, 0.2, 0.3, 0.4});
+  tcd.addOutput(std::vector<bool>{true, false, false, false});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Equal_DifferentType)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_BOOL});
+  cgen.addOperatorEqual({{lhs, rhs}, {out}});
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Equal_InvalidType)
+{
+  CircleGen cgen;
+  int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT32});
+  cgen.addOperatorEqual({{lhs, rhs}, {out}});
+  cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/FullyConnected.cc b/tests/nnfw_api/src/one_op_tests/FullyConnected.cc
new file mode 100644
index 000000000..58bc830ef
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/FullyConnected.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_FullyConnected)
+{
+  CircleGen cgen;
+  // clang-format off
+  std::vector<float> weight_data{ 1, 0, 0, 1,
+                                  2, 0, 0, -1,
+                                  3, 0, 0, 2,
+                                  4, 0, 0, 1,
+                                  1, 0, 0, 1,
+                                  2, 0, 0, -1,
+                                  3, 0, 0, 2,
+                                  4, 0, 0, 1,
+                                  1, 0, 0, 1,
+                                  2, 0, 0, -1,
+                                  3, 0, 0, 2,
+                                  4, 0, 0, 1,
+                                  1, 0, 0, 1,
+                                  2, 0, 0, -1,
+                                  3, 0, 0, 2,
+                                  4, 0, 0, 1 };
+  std::vector<float> bias_data{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+  // clang-format on
+  uint32_t weight_buf = cgen.addBuffer(weight_data);
+  uint32_t bias_buf = cgen.addBuffer(bias_data);
+  int input = cgen.addTensor({{1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int weight = cgen.addTensor({{16, 4}, circle::TensorType::TensorType_FLOAT32, weight_buf});
+  int bias = cgen.addTensor({{16}, circle::TensorType::TensorType_FLOAT32, bias_buf});
+  int output = cgen.addTensor({{1, 16}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorFullyConnected({{input, weight, bias}, {output}});
+  cgen.setInputsAndOutputs({input}, {output});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(
+      uniformTCD<float>({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}}));
+  _context->setBackends({"cpu", "acl_neon"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_FullyConnected16x1)
+{
+  CircleGen cgen;
+  // clang-format off
+  std::vector<float> weight_data{ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+                                  1, -1, 2, 1, 1, -1, 2, 1, 1, -1, 2, 1, 1, -1, 2, 1};
+  std::vector<float> bias_data{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+  // clang-format on
+  uint32_t weight_buf = cgen.addBuffer(weight_data);
+  uint32_t bias_buf = cgen.addBuffer(bias_data);
+  int input = cgen.addTensor({{1, 4}, circle::TensorType::TensorType_FLOAT32});
+  CircleGen::SparsityParams sp{
+      {0, 1, 2, 3},
+      {0, 1},
+      {{CircleGen::SparseDimensionType::DimensionType_DENSE, 1},
+       {CircleGen::SparseDimensionType::DimensionType_SPARSE_CSR, {0, 2}, {0, 3}},
+       {CircleGen::SparseDimensionType::DimensionType_DENSE, 16},
+       {CircleGen::SparseDimensionType::DimensionType_DENSE, 1}}};
+  int weight = cgen.addTensor({{16, 4}, circle::TensorType::TensorType_FLOAT32, weight_buf}, sp);
+  int bias = cgen.addTensor({{16}, circle::TensorType::TensorType_FLOAT32, bias_buf});
+  int output = cgen.addTensor({{1, 16}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorFullyConnected({{input, weight, bias}, {output}});
+  cgen.setInputsAndOutputs({input}, {output});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(
+      uniformTCD<float>({{1, 3, 2, 1}}, {{2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 5, 2, 1, 5, 6}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/If.cc b/tests/nnfw_api/src/one_op_tests/If.cc
new file mode 100644
index 000000000..4ec294223
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/If.cc
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_If)
+{
+  // The model looks just like the below pseudocode
+  //
+  // function model(x)
+  // {
+  //   if (x < 0.0)
+  //     return -100.0;
+  //   else
+  //     return 100.0;
+  // }
+
+  CircleGen cgen;
+
+  // constant buffers
+  std::vector<float> comp_data{0.0};
+  uint32_t comp_buf = cgen.addBuffer(comp_data);
+  std::vector<float> then_data{-100};
+  uint32_t then_buf = cgen.addBuffer(then_data);
+  std::vector<float> else_data{100};
+  uint32_t else_buf = cgen.addBuffer(else_data);
+
+  // primary subgraph
+  {
+    int x = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int comp = cgen.addTensor({{1}, circle::TensorType_FLOAT32, comp_buf});
+    int cond = cgen.addTensor({{1}, circle::TensorType_BOOL});
+    cgen.addOperatorLess({{x, comp}, {cond}});
+
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorIf({{cond}, {ret}}, 1, 2);
+
+    cgen.setInputsAndOutputs({x}, {ret});
+  }
+
+  // then subgraph
+  {
+    cgen.nextSubgraph();
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32, then_buf});
+    cgen.setInputsAndOutputs({}, {ret});
+  }
+
+  // else subgraph
+  {
+    cgen.nextSubgraph();
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32, else_buf});
+    cgen.setInputsAndOutputs({}, {ret});
+  }
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{-1.0}}, {{-100.0}}));
+  _context->addTestCase(uniformTCD<float>({{1.0}}, {{100.0}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+class IfWrongSubgraphIndex : public GenModelTest,
+                             public ::testing::WithParamInterface<std::pair<int, int>>
+{
+};
+
+TEST_P(IfWrongSubgraphIndex, neg_Test)
+{
+  // These values must be less than 0 or greater than 2
+  int then_subg = GetParam().first;
+  int else_subg = GetParam().second;
+
+  // When If operation's subgraph index is invalid
+
+  CircleGen cgen;
+
+  // constant buffers
+  std::vector<float> then_data{-100};
+  uint32_t then_buf = cgen.addBuffer(then_data);
+  std::vector<float> else_data{100};
+  uint32_t else_buf = cgen.addBuffer(else_data);
+
+  // primary subgraph
+  {
+    int x = cgen.addTensor({{1}, circle::TensorType_BOOL});
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorIf({{x}, {ret}}, then_subg, else_subg);
+
+    cgen.setInputsAndOutputs({x}, {ret});
+  }
+
+  // then subgraph
+  {
+    cgen.nextSubgraph();
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32, then_buf});
+    cgen.setInputsAndOutputs({}, {ret});
+  }
+
+  // else subgraph
+  {
+    cgen.nextSubgraph();
+    int ret = cgen.addTensor({{1}, circle::TensorType_FLOAT32, else_buf});
+    cgen.setInputsAndOutputs({}, {ret});
+  }
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+INSTANTIATE_TEST_CASE_P(GenModelTest, IfWrongSubgraphIndex,
+                        ::testing::Values(std::make_pair(99, 2), std::make_pair(-1, 2),
+                                          std::make_pair(1, 99), std::make_pair(1, -99),
+                                          std::make_pair(-99, 99)));
diff --git a/tests/nnfw_api/src/one_op_tests/InstanceNorm.cc b/tests/nnfw_api/src/one_op_tests/InstanceNorm.cc
new file mode 100644
index 000000000..6569ced21
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/InstanceNorm.cc
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_InstanceNorm)
+{
+  CircleGen cgen;
+  uint32_t beta_buf = cgen.addBuffer(std::vector<float>{1});
+  uint32_t gamma_buf = cgen.addBuffer(std::vector<float>{2});
+  int beta = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, beta_buf});
+  int gamma = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, gamma_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorInstanceNorm({{in, beta, gamma}, {out}}, 0, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 1, 1, 1}}, {{2, 2, 2, 2}}));
+  _context->setBackends({"acl_cl", "acl_neon"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_InstanceNorm_InvalidActivation)
+{
+  CircleGen cgen;
+  uint32_t beta_buf = cgen.addBuffer(std::vector<float>{1});
+  uint32_t gamma_buf = cgen.addBuffer(std::vector<float>{2});
+  int beta = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, beta_buf});
+  int gamma = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, gamma_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorInstanceNorm({{in, beta, gamma}, {out}}, 0,
+                               static_cast<circle::ActivationFunctionType>(128) /* Invalid value*/);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/L2Normalization.cc b/tests/nnfw_api/src/one_op_tests/L2Normalization.cc
index 8b4b8f5b6..8e0ae6df2 100644
--- a/tests/nnfw_api/src/one_op_tests/L2Normalization.cc
+++ b/tests/nnfw_api/src/one_op_tests/L2Normalization.cc
@@ -26,9 +26,10 @@ TEST_F(GenModelTest, OneOp_L2Normalization)
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{0, 3, 4, 0, 5, 12, 0, 8, 15, 0, 7, 24}},
-                         {{0, 0.6, 0.8, 0, 0.38461539149284363, 0.92307698726654053, 0,
-                           0.47058823704719543, 0.88235294818878174, 0, 0.28, 0.96}}});
+  _context->addTestCase(
+      uniformTCD<float>({{0, 3, 4, 0, 5, 12, 0, 8, 15, 0, 7, 24}},
+                        {{0, 0.6, 0.8, 0, 0.38461539149284363, 0.92307698726654053, 0,
+                          0.47058823704719543, 0.88235294818878174, 0, 0.28, 0.96}}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
diff --git a/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc b/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc
index 9db911734..e17f34fb3 100644
--- a/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc
+++ b/tests/nnfw_api/src/one_op_tests/LeakyRelu.cc
@@ -25,8 +25,24 @@ TEST_F(GenModelTest, OneOp_LeakyRelu)
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{0, 1.0, 3.0, 1.0, -1.0, -2.0f}}, {{0, 1.0, 3.0, 1.0, -0.5, -1.0}}});
+  _context->addTestCase(
+      uniformTCD<float>({{0, 1.0, 3.0, 1.0, -1.0, -2.0f}}, {{0, 1.0, 3.0, 1.0, -0.5, -1.0}}));
   _context->setBackends({"acl_cl", "acl_neon"});
 
   SUCCEED();
 }
+
+TEST_F(GenModelTest, neg_OneOp_LeakyRelu_InvalidType)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_UINT8});
+  int out = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorLeakyRelu({{in}, {out}}, 0.5);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc b/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc
new file mode 100644
index 000000000..b34b2e83f
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/LogSoftmax.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_LogSoftmax)
+{
+  // NOTE For tf lite the params are fixed as:
+  // beta = 1.0, axis = -1
+
+  CircleGen cgen;
+  int in = cgen.addTensor({{1, 1, 1, 4, 2}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 1, 1, 4, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorLogSoftmax({{in}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->addTestCase(uniformTCD<float>(
+      {{0, -6, 2, 4, 3, -2, 10, 1}},
+      {{-.00247565, -6.00247, -2.12692, -.126928, -.00671534, -5.00671, -.000123374, -9.00012}}));
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_LogSoftmax_InvalidModel)
+{
+  CircleGen cgen;
+  int out = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorLogSoftmax({{}, {out}}); // No input tensor
+  cgen.setInputsAndOutputs({}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/OneHot.cc b/tests/nnfw_api/src/one_op_tests/OneHot.cc
new file mode 100644
index 000000000..e688e790d
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/OneHot.cc
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_OneHot_OffValueToConst)
+{
+  CircleGen cgen;
+  std::vector<int32_t> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  std::vector<float> off_value_data{0};
+  uint32_t off_value_buf = cgen.addBuffer(off_value_data);
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int off_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, off_value_buf});
+  int axis = 2;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value, off_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1, 2, 0, 2});
+  tcd.addInput(std::vector<float>{1});
+  tcd.addOutput(std::vector<float>{0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_OneHot_OffValueToNotZero)
+{
+  CircleGen cgen;
+  std::vector<int32_t> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int off_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int axis = 2;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value, off_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value, off_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1, 2, 0, 2});
+  tcd.addInput(std::vector<float>{1});
+  tcd.addInput(std::vector<float>{-1});
+  tcd.addOutput(std::vector<float>{-1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_OneHot_IndicesValueToNeg_OffValueToConst)
+{
+  CircleGen cgen;
+  std::vector<int32_t> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  std::vector<float> off_value_data{0};
+  uint32_t off_value_buf = cgen.addBuffer(off_value_data);
+  int indices = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int off_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, off_value_buf});
+  int axis = 2;
+  int out = cgen.addTensor({{2, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value, off_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1, 2, 0, -1});
+  tcd.addInput(std::vector<float>{1});
+  tcd.addOutput(std::vector<float>{0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_OneHot_IndicesValueToNeg_OffValueToVar)
+{
+  CircleGen cgen;
+  std::vector<int32_t> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  int indices = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int off_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int axis = 2;
+  int out = cgen.addTensor({{2, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value, off_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value, off_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1, 2, 0, -1});
+  tcd.addInput(std::vector<float>{1});
+  tcd.addInput(std::vector<float>{0});
+  tcd.addOutput(std::vector<float>{0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_OneHot_OneOperand)
+{
+  CircleGen cgen;
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int axis = 2;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_OneHot_TwoOperands)
+{
+  CircleGen cgen;
+  std::vector<int> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int axis = 2;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_OneHot_ThreeOperands)
+{
+  CircleGen cgen;
+  std::vector<int> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int axis = 2;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_OneHot_InvalidAxis)
+{
+  CircleGen cgen;
+  std::vector<int> depth_data{3};
+  uint32_t depth_buf = cgen.addBuffer(depth_data);
+  int indices = cgen.addTensor({{1, 2, 2}, circle::TensorType::TensorType_INT32});
+  int depth = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, depth_buf});
+  int on_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int off_value = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int axis = 4;
+  int out = cgen.addTensor({{1, 2, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorOneHot({{indices, depth, on_value, off_value}, {out}}, axis);
+  cgen.setInputsAndOutputs({indices, on_value, off_value}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Pad.cc b/tests/nnfw_api/src/one_op_tests/Pad.cc
index 10fe6c78a..63d02ab88 100644
--- a/tests/nnfw_api/src/one_op_tests/Pad.cc
+++ b/tests/nnfw_api/src/one_op_tests/Pad.cc
@@ -28,7 +28,8 @@ TEST_F(GenModelTest, OneOp_Pad)
   cgen.addOperatorPad({{in, padding}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}}});
+  _context->addTestCase(
+      uniformTCD<float>({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
@@ -48,7 +49,7 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadRank)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -67,7 +68,7 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim0)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -86,7 +87,7 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim1)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
diff --git a/tests/nnfw_api/src/one_op_tests/PadV2.cc b/tests/nnfw_api/src/one_op_tests/PadV2.cc
index 9f7ff9c0e..e613fe282 100644
--- a/tests/nnfw_api/src/one_op_tests/PadV2.cc
+++ b/tests/nnfw_api/src/one_op_tests/PadV2.cc
@@ -34,7 +34,8 @@ TEST_F(GenModelTest, OneOp_PadV2)
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{1, 2, 3, 4}}, {{3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3}}});
+  _context->addTestCase(
+      uniformTCD<float>({{1, 2, 3, 4}}, {{3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3}}));
   _context->setBackends({"cpu"});
 
   SUCCEED();
@@ -59,7 +60,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadRank)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -83,7 +84,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadDim0)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
@@ -107,7 +108,7 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadDim1)
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-  _context->setCompileFail();
+  _context->expectFailCompile();
 
   SUCCEED();
 }
diff --git a/tests/nnfw_api/src/one_op_tests/Rank.cc b/tests/nnfw_api/src/one_op_tests/Rank.cc
index ed9d67294..7af1b4540 100644
--- a/tests/nnfw_api/src/one_op_tests/Rank.cc
+++ b/tests/nnfw_api/src/one_op_tests/Rank.cc
@@ -17,26 +17,19 @@
 #include "GenModelTest.h"
 
 // WORKAROUND Handle int32_t type input/output
-union float_int {
-  int32_t i;
-  float f;
-};
-
 TEST_F(GenModelTest, OneOp_Rank)
 {
   CircleGen cgen;
   int in = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32});
   int out = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32});
 
-  // TODO handle many type in addTestCase
-  float_int output_data;
-  output_data.i = 4;
-
   cgen.addOperatorRank({{in}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase(
-      {{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}}, {{output_data.f}}});
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
+  tcd.addOutput(std::vector<int32_t>{4});
+  _context->addTestCase(tcd);
   _context->setBackends({"cpu"});
 
   SUCCEED();
@@ -49,14 +42,11 @@ TEST_F(GenModelTest, OneOp_Rank_Int32)
   int out = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32});
 
   // TODO handle many type in addTestCase
-  float_int output_data;
-  output_data.i = 4;
-
   cgen.addOperatorRank({{in}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase(
-      {{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}}, {{output_data.f}}});
+  _context->addTestCase(uniformTCD<int32_t>(
+      {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}}, {{4}}));
   _context->setBackends({"cpu"});
 
   SUCCEED();
diff --git a/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc
new file mode 100644
index 000000000..555d074a3
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_ResizeBilinear_SizeToConst)
+{
+  CircleGen cgen;
+  std::vector<int32_t> size_data{3, 3};
+  uint32_t size_buf = cgen.addBuffer(size_data);
+  int size = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, size_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorResizeBilinear({{in, size}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {{1, 1, 2, 2}}, {{1, 1, 1, 1.666666667, 1.666666667, 1.666666667, 2, 2, 2}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_ResizeBilinear_SizeToVar)
+{
+  CircleGen cgen;
+  int size = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorResizeBilinear({{in, size}, {out}});
+  cgen.setInputsAndOutputs({in, size}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{3, 3});
+  tcd.addInput(std::vector<float>{1, 1, 2, 2});
+  tcd.addOutput(std::vector<float>{1, 1, 1, 1.666666667, 1.666666667, 1.666666667, 2, 2, 2});
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_ResizeBilinear_InvalidSizeVal)
+{
+  CircleGen cgen;
+  std::vector<int32_t> size_data{-3, 3};
+  uint32_t size_buf = cgen.addBuffer(size_data);
+  int size = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, size_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorResizeBilinear({{in, size}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc b/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc
index 94f45d4a5..d1617c33a 100644
--- a/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc
+++ b/tests/nnfw_api/src/one_op_tests/ResizeNearestNeighbor.cc
@@ -30,8 +30,9 @@ TEST_F(GenModelTest, OneOp_ResizeNearestNeighbor)
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{3, 4, 6, 10, 9, 10, 12, 16}},
-                         {{3, 4, 3, 4, 6, 10, 3, 4, 3, 4, 6, 10, 9, 10, 9, 10, 12, 16}}});
+  _context->addTestCase(
+      uniformTCD<float>({{3, 4, 6, 10, 9, 10, 12, 16}},
+                        {{3, 4, 3, 4, 6, 10, 3, 4, 3, 4, 6, 10, 9, 10, 9, 10, 12, 16}}));
   _context->setBackends({"acl_cl"});
 
   SUCCEED();
diff --git a/tests/nnfw_api/src/one_op_tests/Reverse.cc b/tests/nnfw_api/src/one_op_tests/Reverse.cc
new file mode 100644
index 000000000..ef0c5fe82
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Reverse.cc
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_ReverseV2_3D)
+{
+  CircleGen cgen;
+
+  int in = cgen.addTensor({{4, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int out = cgen.addTensor({{4, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorReverseV2({{in, axis}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "cpu"});
+  _context->addTestCase(uniformTCD<float>(
+      {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}},
+      {{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}}));
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_ReverseV2_1D)
+{
+  CircleGen cgen;
+
+  int in = cgen.addTensor({{4}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> axis_data{0};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int out = cgen.addTensor({{4}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorReverseV2({{in, axis}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "cpu"});
+  _context->addTestCase(uniformTCD<float>({{1, 2, 3, 4}}, {{4, 3, 2, 1}}));
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Split.cc b/tests/nnfw_api/src/one_op_tests/Split.cc
new file mode 100644
index 000000000..1e91efec8
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Split.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_Split)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 4}, circle::TensorType::TensorType_FLOAT32});
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+
+  int out1 = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+  int out2 = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorSplit({{axis, in}, {out1, out2}}, 2);
+  cgen.setInputsAndOutputs({in}, {out1, out2});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(
+      uniformTCD<float>({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6}, {3, 4, 7, 8}}));
+  _context->setBackends({"cpu", "acl_cl", "acl_neon"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_SplitNonConstAxis)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 4}, circle::TensorType::TensorType_FLOAT32});
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32});
+
+  int out1 = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+  int out2 = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorSplit({{axis, in}, {out1, out2}}, 2);
+  cgen.setInputsAndOutputs({axis, in}, {out1, out2});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+
+  TestCaseData tcd;
+  tcd.addInput(std::vector<int32_t>{1});
+  tcd.addInput(std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+  tcd.addOutput(std::vector<float>{1, 2, 5, 6});
+  tcd.addOutput(std::vector<float>{3, 4, 7, 8});
+
+  _context->addTestCase(tcd);
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/StridedSlice.cc b/tests/nnfw_api/src/one_op_tests/StridedSlice.cc
new file mode 100644
index 000000000..fb29018d4
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/StridedSlice.cc
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_StridedSlice_LastDim)
+{
+  CircleGen cgen;
+  std::vector<int32_t> begin_data{0, 3};
+  std::vector<int32_t> end_data{0, 6};
+  std::vector<int32_t> strides_data{1, 1};
+  uint32_t begin_buf = cgen.addBuffer(begin_data);
+  uint32_t end_buf = cgen.addBuffer(end_data);
+  uint32_t strides_buf = cgen.addBuffer(strides_data);
+  int input = cgen.addTensor({{1, 6}, circle::TensorType::TensorType_FLOAT32});
+  int begin = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, begin_buf});
+  int end = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, end_buf});
+  int strides = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, strides_buf});
+  int out = cgen.addTensor({{1, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorStridedSlice({{input, begin, end, strides}, {out}}, 1, 1);
+  cgen.setInputsAndOutputs({input}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{1, 2, 3, 4, 5, 6}}, {{4, 5, 6}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Tile.cc b/tests/nnfw_api/src/one_op_tests/Tile.cc
new file mode 100644
index 000000000..5fa76fc6d
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Tile.cc
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+TEST_F(GenModelTest, OneOp_Tile_ConstMul)
+{
+  CircleGen cgen;
+  std::vector<int32_t> mul_data{1, 2};
+  uint32_t mul_buf = cgen.addBuffer(mul_data);
+  int in = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int mul = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, mul_buf});
+  int out = cgen.addTensor({{2, 6}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, mul}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(
+      uniformTCD<float>({{1, 2, 3, 4, 5, 6}}, {{1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Tile_MulToConst)
+{
+  CircleGen cgen;
+  std::vector<int32_t> multiplies_data{2, 3, 1};
+  uint32_t multiplies_buf = cgen.addBuffer(multiplies_data);
+  int multiplies = cgen.addTensor({{3}, circle::TensorType::TensorType_INT32, multiplies_buf});
+  int in = cgen.addTensor({{1, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2, 6, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, multiplies}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {{11, 12, 13, 21, 22, 23}},
+      {{11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23,
+        11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Tile_MulToVar)
+{
+  CircleGen cgen;
+  int multiplies = cgen.addTensor({{3}, circle::TensorType::TensorType_INT32});
+  int in = cgen.addTensor({{1, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2, 6, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, multiplies}, {out}});
+  cgen.setInputsAndOutputs({in, multiplies}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{11, 12, 13, 21, 22, 23});
+  tcd.addInput(std::vector<int32_t>{2, 3, 1});
+  tcd.addOutput(std::vector<float>{11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23,
+                                   11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23,
+                                   11, 12, 13, 21, 22, 23, 11, 12, 13, 21, 22, 23});
+  _context->addTestCase(tcd);
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Tile_VarMul)
+{
+  CircleGen cgen;
+  int in = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int mul = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32});
+  int out = cgen.addTensor({{2, 6}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, mul}, {out}});
+  cgen.setInputsAndOutputs({in, mul}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 2, 3, 4, 5, 6});
+  tcd.addInput(std::vector<int32_t>{1, 2});
+  tcd.addOutput(std::vector<float>{1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6});
+  _context->addTestCase(tcd);
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Tile)
+{
+  CircleGen cgen;
+  std::vector<int32_t> mul_data{1, 2, 1, 2};
+  uint32_t mul_buf = cgen.addBuffer(mul_data);
+  int in = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
+  // 2D multiples input is not supported
+  int mul = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_INT32, mul_buf});
+  int out = cgen.addTensor({{2, 6}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, mul}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Tile_InvalidMulSize)
+{
+  CircleGen cgen;
+  std::vector<int32_t> multiplies_data{2, 6};
+  uint32_t multiplies_buf = cgen.addBuffer(multiplies_data);
+  int multiplies = cgen.addTensor({{2}, circle::TensorType::TensorType_INT32, multiplies_buf});
+  int in = cgen.addTensor({{1, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2, 6, 3}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTile({{in, multiplies}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Transpose.cc b/tests/nnfw_api/src/one_op_tests/Transpose.cc
new file mode 100644
index 000000000..f2e971198
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Transpose.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_Transpose_PermsToConst)
+{
+  CircleGen cgen;
+  std::vector<int32_t> perms_data{2, 0, 1, 3};
+  uint32_t perms_buf = cgen.addBuffer(perms_data);
+  int perms = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, perms_buf});
+  int in = cgen.addTensor({{2, 3, 4, 5}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{2, 3, 4, 5}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>(
+      {{0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,  17,
+        18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+        36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
+        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+        72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+        90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119}},
+      {{0,  1,   2,   3,   4,   20,  21, 22,  23,  24,  40,  41,  42, 43,  44,  60,  61,  62,
+        63, 64,  80,  81,  82,  83,  84, 100, 101, 102, 103, 104, 5,  6,   7,   8,   9,   25,
+        26, 27,  28,  29,  45,  46,  47, 48,  49,  65,  66,  67,  68, 69,  85,  86,  87,  88,
+        89, 105, 106, 107, 108, 109, 10, 11,  12,  13,  14,  30,  31, 32,  33,  34,  50,  51,
+        52, 53,  54,  70,  71,  72,  73, 74,  90,  91,  92,  93,  94, 110, 111, 112, 113, 114,
+        15, 16,  17,  18,  19,  35,  36, 37,  38,  39,  55,  56,  57, 58,  59,  75,  76,  77,
+        78, 79,  95,  96,  97,  98,  99, 115, 116, 117, 118, 119}}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Transpose_PermsToVar)
+{
+  CircleGen cgen;
+  int perms = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32});
+  int in = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 3, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in, perms}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 2, 3, 4, 5, 6});
+  tcd.addInput(std::vector<int32_t>{0, 2, 1, 3});
+  tcd.addOutput(std::vector<float>{1, 4, 2, 5, 3, 6});
+  _context->addTestCase(tcd);
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Transpose_RegularTranspose)
+{
+  CircleGen cgen;
+  int perms = cgen.addTensor({{0}, circle::TensorType::TensorType_INT32});
+  int in = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 3, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in, perms}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  TestCaseData tcd;
+  tcd.addInput(std::vector<float>{1, 2, 3, 4, 5, 6});
+  tcd.addInput(std::vector<int32_t>{});
+  tcd.addOutput(std::vector<float>{1, 4, 2, 5, 3, 6});
+  _context->addTestCase(tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Transpose_InvalidPermsSize)
+{
+  CircleGen cgen;
+  std::vector<int32_t> perms_data{0, 1, 2};
+  uint32_t perms_buf = cgen.addBuffer(perms_data);
+  int perms = cgen.addTensor({{3}, circle::TensorType::TensorType_INT32, perms_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Transpose_InvalidPermsVal)
+{
+  CircleGen cgen;
+  std::vector<int32_t> perms_data{-3, 3, 1, 2};
+  uint32_t perms_buf = cgen.addBuffer(perms_data);
+  int perms = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, perms_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Transpose_DuplicatedPermsVal)
+{
+  CircleGen cgen;
+  std::vector<int32_t> perms_data{3, 3, 1, 2};
+  uint32_t perms_buf = cgen.addBuffer(perms_data);
+  int perms = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, perms_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  cgen.addOperatorTranspose({{in, perms}, {out}});
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+  _context->expectFailCompile();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/While.cc b/tests/nnfw_api/src/one_op_tests/While.cc
index 1d86e6d6a..8b909c29f 100644
--- a/tests/nnfw_api/src/one_op_tests/While.cc
+++ b/tests/nnfw_api/src/one_op_tests/While.cc
@@ -66,10 +66,130 @@ TEST_F(GenModelTest, OneOp_While)
   }
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase({{{0}}, {{100}}});
-  _context->addTestCase({{{2}}, {{102}}});
-  _context->addTestCase({{{22}}, {{102}}});
+  _context->addTestCase(uniformTCD<float>({{0}}, {{100}}));
+  _context->addTestCase(uniformTCD<float>({{2}}, {{102}}));
+  _context->addTestCase(uniformTCD<float>({{22}}, {{102}}));
+  _context->addTestCase(uniformTCD<float>({{100}}, {{100}}));
   _context->setBackends({"cpu"});
 
   SUCCEED();
 }
+
+TEST_F(GenModelTest, OneOp_While_TwoInputs)
+{
+  // The model looks just like the below pseudocode
+  //
+  // function model(x, end)
+  // {
+  //   while (x < end)
+  //   {
+  //     x = x + 10.0
+  //   }
+  //   return x
+  // }
+
+  CircleGen cgen;
+  std::vector<float> incr_data{10};
+  uint32_t incr_buf = cgen.addBuffer(incr_data);
+
+  // primary subgraph
+  {
+    int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorWhile({{x_in, end_in}, {x_out, end_out}}, 1, 2);
+    cgen.setInputsAndOutputs({x_in, end_in}, {x_out});
+  }
+
+  // cond subgraph
+  {
+    cgen.nextSubgraph();
+    int x = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int result = cgen.addTensor({{1}, circle::TensorType_BOOL});
+    cgen.addOperatorLess({{x, end}, {result}});
+    cgen.setInputsAndOutputs({x, end}, {result});
+  }
+
+  // body subgraph
+  {
+    cgen.nextSubgraph();
+    int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int incr = cgen.addTensor({{1}, circle::TensorType_FLOAT32, incr_buf});
+    int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorAdd({{x_in, incr}, {x_out}}, circle::ActivationFunctionType_NONE);
+    cgen.setInputsAndOutputs({x_in, end}, {x_out, end});
+  }
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{0}, {20}}, {{20}}));
+  _context->addTestCase(uniformTCD<float>({{5}, {30}}, {{35}}));
+  _context->addTestCase(uniformTCD<float>({{20}, {10}}, {{20}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+class WhileWrongSubgraphIndex : public GenModelTest,
+                                public ::testing::WithParamInterface<std::pair<int, int>>
+{
+};
+
+TEST_P(WhileWrongSubgraphIndex, neg_Test)
+{
+  // These values must be less than 0 or greater than 2
+  int cond_subg = GetParam().first;
+  int body_subg = GetParam().second;
+
+  // When While operation's subgraph index is invalid
+
+  CircleGen cgen;
+
+  // constant buffers
+  std::vector<float> incr_data{10};
+  uint32_t incr_buf = cgen.addBuffer(incr_data);
+
+  // primary subgraph
+  {
+    int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorWhile({{x_in, end_in}, {x_out, end_out}}, cond_subg, body_subg);
+    cgen.setInputsAndOutputs({x_in, end_in}, {x_out});
+  }
+
+  // cond subgraph
+  {
+    cgen.nextSubgraph();
+    int x = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int result = cgen.addTensor({{1}, circle::TensorType_BOOL});
+    cgen.addOperatorLess({{x, end}, {result}});
+    cgen.setInputsAndOutputs({x, end}, {result});
+  }
+
+  // body subgraph
+  {
+    cgen.nextSubgraph();
+    int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int incr = cgen.addTensor({{1}, circle::TensorType_FLOAT32, incr_buf});
+    int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+    cgen.addOperatorAdd({{x_in, incr}, {x_out}}, circle::ActivationFunctionType_NONE);
+    cgen.setInputsAndOutputs({x_in, end}, {x_out, end});
+  }
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->setBackends({"cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
+
+INSTANTIATE_TEST_CASE_P(GenModelTest, WhileWrongSubgraphIndex,
+                        ::testing::Values(std::make_pair(99, 2), std::make_pair(-1, 2),
+                                          std::make_pair(1, 99), std::make_pair(1, -99),
+                                          std::make_pair(-99, 99)));
diff --git a/tests/scripts/benchmark.sh b/tests/scripts/benchmark.sh
new file mode 100644
index 000000000..a6bb821b7
--- /dev/null
+++ b/tests/scripts/benchmark.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# TODO : tizen sdb support
+# TODO : multiple backend at once
+#
+# This benchmark tool works as follows:
+# 0. Prepare test-suite
+#
+# On building, set make target to build_test_suite. This will create test-suite.tar.gz under Product/out directory.
+# ```
+# $ make build_test_suite
+# ```
+#
+# 1. Install test-suite into target devices
+#   - On android, test-suite should be located on /data/local/tmp/
+#   - On Tizen, nnfw-test pacakge will install test-suite into /opt/usr/nnfw-test/
+#
+# 2. Prepare nnpackge
+#
+# 3. Run benchmark
+#
+# ```
+# $./benchmark.sh --backend=cpu --num_runs=5 --nnpackge=/path/to/nnpkg
+#
+# ```
+# 4. Result trace.json
+#  - trace.json is the result file
+
+SCRIPT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+## If no optional argument is passed, set for android
+TEST_ROOT=/data/local/tmp/
+BRIDGE=adb
+BACKENDS=cpu
+NUM_RUNS=3
+
+function Usage()
+{
+    echo "Usage: ./benchamrk.sh --bridge=adb --backends=cpu --num_runs=5 --nnpackge=/path/to/nnpkg"
+    echo ""
+    echo "--bridge                  : adb or sdb"
+    echo "--nnpackage=<dir>         : directory containing nnpackage"
+    echo "--num_runs                : number of runs"
+    echo "--backends                : backend list"
+}
+
+# Parse command argv
+for i in "$@"
+do
+    case $i in
+        -h|--help|help)
+            Usage
+            exit 1
+            ;;
+        --bridge=*)
+            BRIDGE=${i#*=}
+            ;;
+        --bridge)
+            BRIDGE="$2"
+            shift
+            ;;
+        --num_runs=*)
+            NUM_RUNS=${i#*=}
+            ;;
+        --num_runs)
+            NUM_RUNS="$2"
+            shift
+            ;;
+        --nnpackage=*)
+            NNPKG_PATH=${i#*=}
+            ;;
+        --nnpackage)
+            NNPKG_PATH="$2"
+            shift
+            ;;
+    esac
+    shift
+done
+
+
+NNPKG_PATH_TARGET=$TEST_ROOT/nnpkg/`basename $NNPKG_PATH`
+
+# 0. Push nnpackage into targeta
+echo "Pusing nnpackge into ${NNPKG_PATH_TARGET}"
+pushd $NNPKG_PATH/.. > /dev/null
+tar -zcf nnpkg.tar.gz `basename $NNPKG_PATH`
+$BRIDGE push nnpkg.tar.gz $TEST_ROOT
+rm nnpkg.tar.gz
+popd > /dev/null
+$BRIDGE shell mkdir -p $TEST_ROOT/nnpkg
+$BRIDGE shell tar -zxf $TEST_ROOT/nnpkg.tar.gz -C $TEST_ROOT/nnpkg
+$BRIDGE shell rm $TEST_ROOT/nnpkg.tar.gz
+
+# 1. Run
+$BRIDGE shell LD_LIBRARY_PATH=$TEST_ROOT/Product/out/lib OP_SEQ_MAX_NODE=1 TRACE_FILEPATH=$TEST_ROOT/trace.json BACKENDS=$BACKENDS $TEST_ROOT/Product/out/bin/nnpackage_run --nnpackage $NNPKG_PATH_TARGET -r $NUM_RUNS
+
+# 2. Pull result file
+echo "Pulling data from target to trace.json"
+$BRIDGE pull $TEST_ROOT/trace.json
+
+# 3. Clean up
+$BRIDGE shell rm -rf $TEST_ROOT/nnpkg
diff --git a/tests/scripts/command/prepare-model b/tests/scripts/command/prepare-model
index 3feb7a799..9fd790ebe 100644
--- a/tests/scripts/command/prepare-model
+++ b/tests/scripts/command/prepare-model
@@ -50,6 +50,12 @@ do
     shift
 done
 
+# Default download server url
+if [[ -z "$MODELFILE_SERVER" ]]; then
+    export MODELFILE_SERVER="http://npu.mooo.com/archive/tflite_test_model/"
+fi
+echo "Download from $MODELFILE_SERVER"
+
 if [[ $DOWNLOAD_MODEL == "all" ]] || [[ $DOWNLOAD_MODEL == "tflite" ]]; then
     # Download tflite models
     $INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK
diff --git a/tests/scripts/common_android.sh b/tests/scripts/common_android.sh
new file mode 100644
index 000000000..66601aea3
--- /dev/null
+++ b/tests/scripts/common_android.sh
@@ -0,0 +1,76 @@
+#!/system/bin/sh
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+MY_PATH="$( cd "$( dirname "$0" )" && pwd )"
+
+SHELL_CMD=/system/bin/sh
+
+function get_result_of_benchmark_test()
+{
+    local DRIVER_BIN=$1
+    local MODEL=$2
+    local LOG_FILE=$3
+
+    local RET=0
+    $SHELL_CMD $MY_PATH/models/run_test_android.sh --driverbin="$DRIVER_BIN  -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+    RET=$?
+    if [[ $RET -ne 0 ]]; then
+        echo "Testing $MODEL aborted... exit code: $RET"
+        exit $RET
+    fi
+
+    local RESULT=`grep -E '^- MEAN ' $LOG_FILE | awk '{print $4}'`
+    echo "$RESULT"
+}
+
+function print_result_of_benchmark_test()
+{
+    local NAME=$1
+    local RESULT=$2
+    local RESULT_FILE=$3
+
+    echo "$NAME $RESULT" > $RESULT_FILE
+}
+
+function print_with_dots()
+{
+    PRINT_WIDTH=45
+    local MSG="$@"
+    pad="............................................."
+    padlength=$((PRINT_WIDTH- ${#MSG}))
+    printf '%s' "$MSG"
+    printf '%*.*s ' 0 $padlength "$pad"
+}
+
+
+function run_benchmark_and_print()
+{
+    local WRITE_FILE_NAME=$1
+    local MSG=$2
+    local MODEL=$3
+    local REPORT_MODEL_DIR=$4
+    local PAUSE_TIME_IN_SEC=$5
+    local DRIVER_BIN=$6
+    local BENCHMARK_RUN_TEST_SH=$7
+
+    LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
+    RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
+    print_with_dots $MSG
+    RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
+    echo "$RESULT ms"
+    print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
+    sleep $PAUSE_TIME_IN_SEC
+}
diff --git a/tests/scripts/models/nnfw_api_gtest/add_unspecified_rank_inputs/config.sh b/tests/scripts/models/nnfw_api_gtest/add_unspecified_rank_inputs/config.sh
deleted file mode 100644
index 2cc30f915..000000000
--- a/tests/scripts/models/nnfw_api_gtest/add_unspecified_rank_inputs/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="add_unspecified_rank_inputs.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/dynamic_tensor_reshape/config.sh b/tests/scripts/models/nnfw_api_gtest/dynamic_tensor_reshape/config.sh
deleted file mode 100644
index 8496509cb..000000000
--- a/tests/scripts/models/nnfw_api_gtest/dynamic_tensor_reshape/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="dynamic_tensor_reshape.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/input_reshaping_add/config.sh b/tests/scripts/models/nnfw_api_gtest/input_reshaping_add/config.sh
deleted file mode 100644
index 4e5a636ee..000000000
--- a/tests/scripts/models/nnfw_api_gtest/input_reshaping_add/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="input_reshaping_add.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/neg/config.sh b/tests/scripts/models/nnfw_api_gtest/neg/config.sh
deleted file mode 100644
index fdf093686..000000000
--- a/tests/scripts/models/nnfw_api_gtest/neg/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="neg.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/unknown_dim_input_concat/config.sh b/tests/scripts/models/nnfw_api_gtest/unknown_dim_input_concat/config.sh
deleted file mode 100644
index a3cab4787..000000000
--- a/tests/scripts/models/nnfw_api_gtest/unknown_dim_input_concat/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="unknown_dim_input_concat.zip"
diff --git a/tests/scripts/models/run_test.sh b/tests/scripts/models/run_test.sh
index 32a277859..2db6c12e1 100755
--- a/tests/scripts/models/run_test.sh
+++ b/tests/scripts/models/run_test.sh
@@ -155,7 +155,7 @@ run_tests()
         # Test configure initialization
         ((i++))
         STATUS="enabled"
-        MODELFILE_SERVER_PATH=""
+        MODELFILE_URL_BASE=""
         MODELFILE_NAME=""
         source $TEST_ROOT_PATH/$TEST_NAME/config.sh
 
@@ -168,13 +168,12 @@ run_tests()
             continue
         fi
 
-        TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME
-        MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME
+        MODELFILE=$CACHE_ROOT_PATH/$MODELFILE_NAME
 
         # Find model file for downloaded by zip
         if [ "${MODELFILE_NAME##*.}" = "zip" ]; then
-            pushd $TEST_CACHE_PATH
-            MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite)
+            pushd $CACHE_ROOT_PATH
+            MODELFILE=$CACHE_ROOT_PATH/$(ls ${MODELFILE_NAME%.zip}/*.tflite)
             popd
         fi
 
@@ -211,20 +210,19 @@ download_tests()
     for TEST_NAME in $SELECTED_TESTS; do
         # Test configure initialization
         ((i++))
-        MODELFILE_SERVER_PATH=""
+        MODELFILE_URL_BASE=""
         MODELFILE_NAME=""
         source $TEST_ROOT_PATH/$TEST_NAME/config.sh
 
-        TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME
-        MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME
-        MODELFILE_URL="$MODELFILE_SERVER/$MODELFILE_NAME"
+        MODELFILE=$CACHE_ROOT_PATH/$MODELFILE_NAME
+        MODELFILE_URL="$MODELFILE_URL_BASE/$MODELFILE_NAME"
         if [ -n  "$FIXED_MODELFILE_SERVER" ]; then
             MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME"
         fi
 
         # Download model file
-        if [ ! -e $TEST_CACHE_PATH ]; then
-            mkdir -p $TEST_CACHE_PATH
+        if [ ! -e $CACHE_ROOT_PATH ]; then
+            mkdir -p $CACHE_ROOT_PATH
         fi
 
         # Download unless we have it in cache (Also check md5sum)
@@ -234,10 +232,10 @@ download_tests()
             echo "======================"
 
             rm -f $MODELFILE # Remove invalid file if exists
-            pushd $TEST_CACHE_PATH
+            pushd $CACHE_ROOT_PATH
             wget -nv $MODELFILE_URL
             if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
-                unzip -o $MODELFILE_NAME
+                unzip -o $MODELFILE_NAME -d ${MODELFILE_NAME%.zip}
             fi
             popd
         fi
diff --git a/tests/scripts/models/run_test_android.sh b/tests/scripts/models/run_test_android.sh
new file mode 100644
index 000000000..74b5cdd46
--- /dev/null
+++ b/tests/scripts/models/run_test_android.sh
@@ -0,0 +1,186 @@
+#!/system/bin/sh
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+MY_PATH="$( cd "$( dirname "$0" )" && pwd )"
+NNFW_HOME="$(dirname $(dirname $(dirname ${MY_PATH})))"
+
+CACHE_ROOT_PATH=$MY_PATH/"cache"
+TEST_ROOT_PATH=$MY_PATH/"tflite"
+REPORT_DIR="report"
+
+RUN_DISABLED="true"
+
+function Usage()
+{
+    echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
+    echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
+    echo ""
+    echo "--run                 - (default=on) Test model files"
+    echo "--driverbin           - (default=../../Product/out/bin/tflite_run) Runner for runnning model tests"
+    echo "--reportdir           - (default=report) Directory to place tap files"
+    echo "--tapname             - (default=framework_test.tap) File name to be written for tap"
+    echo "--configdir           - (default=$TEST_ROOT_PATH) Config directory to download and test model"
+    echo "--cachedir            - (default=$CACHE_ROOT_PATH) Directory to download model"
+    echo ""
+}
+
+DRIVER_BIN=""
+TAP_NAME="framework_test.tap"
+TEST_LIST=()
+RUN_TEST="on"
+MD5_CHECK="off"
+
+# Support environment variable setting for mirror server
+FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
+
+for i in "$@"
+do
+    case $i in
+        -h|--help|help)
+            Usage
+            exit 1
+            ;;
+        --driverbin=*)
+            DRIVER_BIN=${i#*=}
+            ;;
+        --reportdir=*)
+            REPORT_DIR=${i#*=}
+            ;;
+        --tapname=*)
+            TAP_NAME=${i#*=}
+            ;;
+        --run=*)
+            RUN_TEST=${i#*=}
+            ;;
+        --configdir=*)
+            TEST_ROOT_PATH=${i#*=}
+            ;;
+        --cachedir=*)
+            CACHE_ROOT_PATH=${i#*=}
+            ;;
+        *)
+            TEST_LIST+=( $i )
+            ;;
+    esac
+    shift
+done
+
+if [[ ${#TEST_LIST[@]} -eq 0 ]]; then
+    RUN_DISABLED="false"
+fi
+
+if [ ! -n "$DRIVER_BIN" ]; then
+    DRIVER_BIN="$NNFW_HOME/Product/out/bin/tflite_run"
+fi
+
+if [ ! -d "$TEST_ROOT_PATH" ]; then
+    echo "Cannot find config directory for test: please set proper configdir"
+    exit 1
+fi
+
+run_tests()
+{
+    echo "1..$#" > $REPORT_DIR/$TAP_NAME
+    SELECTED_TESTS=$@
+
+    echo ""
+    echo "Running tests:"
+    echo "======================"
+    for TEST_NAME in $SELECTED_TESTS; do
+        echo $TEST_NAME
+    done
+    echo "======================"
+
+    TOTAL_RESULT=0  # 0(normal) or 1(abnormal)
+    i=0
+    for TEST_NAME in $SELECTED_TESTS; do
+        # Test configure initialization
+        ((i++))
+        STATUS="enabled"
+        MODELFILE_SERVER_PATH=""
+        MODELFILE_NAME=""
+        source $TEST_ROOT_PATH/$TEST_NAME/config.sh
+
+        LOWER_STATUS="$(echo $STATUS | awk '{print tolower($0)}')"
+        if [ "$LOWER_STATUS" == "disabled" ] && [ "$RUN_DISABLED" == "false" ]; then
+            echo ""
+            echo "Skip $TEST_NAME"
+            echo "======================"
+            echo "ok $i # skip $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
+            continue
+        fi
+
+        MODELFILE=$CACHE_ROOT_PATH/$MODELFILE_NAME
+
+        # Find model file for downloaded by zip
+        if [ "${MODELFILE_NAME##*.}" = "zip" ]; then
+            __PWD=$(pwd)
+            cd $CACHE_ROOT_PATH
+            MODELFILE=$CACHE_ROOT_PATH/$(ls ${MODELFILE_NAME%.zip}/*.tflite)
+            cd $__PWD
+        fi
+
+        echo ""
+        echo "Run $TEST_NAME"
+        echo "======================"
+
+        # Run driver to test framework
+        $DRIVER_BIN $MODELFILE
+
+        if [[ $? -eq 0 ]]; then
+            echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
+        else
+            echo "not ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
+            TOTAL_RESULT=1
+        fi
+    done
+    return $TOTAL_RESULT
+}
+
+find_tests()
+{
+    local TEST_DIRS="$@"
+    local TESTS_TO_RUN=""
+
+    if [[ $# -eq 0 ]]; then
+        TEST_DIRS="."
+    fi
+
+    shift $#
+
+    __PWD=$(pwd)
+    cd $TEST_ROOT_PATH
+    for DIR in $TEST_DIRS; do
+        if [ -d "$DIR" ]; then
+            TESTS_FOUND=$(find "$DIR" -type f -name 'config.sh' -exec dirname {} \;| sed 's|^./||' | sort)
+            TESTS_TO_RUN="$TESTS_TO_RUN $TESTS_FOUND"
+        else
+            echo "Test $DIR was not found. This test is not added." 1>&2
+        fi
+    done
+    cd $__PWD
+
+    echo $TESTS_TO_RUN
+}
+
+mkdir -p $REPORT_DIR
+TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]})
+
+if [ "$RUN_TEST" = "on" ]; then
+    run_tests $TESTS_TO_RUN
+fi
+
+exit 0
diff --git a/tests/scripts/models/tflite/MODELS/inception_nonslim/config.sh b/tests/scripts/models/tflite/MODELS/inception_nonslim/config.sh
index 39f5d772d..69c48d029 100755
--- a/tests/scripts/models/tflite/MODELS/inception_nonslim/config.sh
+++ b/tests/scripts/models/tflite/MODELS/inception_nonslim/config.sh
@@ -1,2 +1,2 @@
-MODELFILE_SERVER_PATH="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
+MODELFILE_URL_BASE="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
 MODELFILE_NAME="inception_v3_2015_2017_11_10.zip"
diff --git a/tests/scripts/models/tflite/MODELS/inception_slim/config.sh b/tests/scripts/models/tflite/MODELS/inception_slim/config.sh
index 1c0cf3ef5..dc798a555 100755
--- a/tests/scripts/models/tflite/MODELS/inception_slim/config.sh
+++ b/tests/scripts/models/tflite/MODELS/inception_slim/config.sh
@@ -1,2 +1,2 @@
-MODELFILE_SERVER_PATH="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
+MODELFILE_URL_BASE="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
 MODELFILE_NAME="inception_v3_slim_2016_android_2017_11_10.zip"
diff --git a/tests/scripts/models/tflite/MODELS/mobilenet/config.sh b/tests/scripts/models/tflite/MODELS/mobilenet/config.sh
index b23d687cd..8ee45f7fb 100755
--- a/tests/scripts/models/tflite/MODELS/mobilenet/config.sh
+++ b/tests/scripts/models/tflite/MODELS/mobilenet/config.sh
@@ -1,2 +1,2 @@
-MODELFILE_SERVER_PATH="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
+MODELFILE_URL_BASE="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
 MODELFILE_NAME="mobilenet_v1_0.25_128_float_2017_11_08.zip"
diff --git a/tests/scripts/models/tflite/MODELS/mobilenet_quant8/config.sh b/tests/scripts/models/tflite/MODELS/mobilenet_quant8/config.sh
index 2e304df92..e4700fc85 100755
--- a/tests/scripts/models/tflite/MODELS/mobilenet_quant8/config.sh
+++ b/tests/scripts/models/tflite/MODELS/mobilenet_quant8/config.sh
@@ -1,2 +1,2 @@
-MODELFILE_SERVER_PATH="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
+MODELFILE_URL_BASE="https://storage.googleapis.com/download.tensorflow.org/models/tflite"
 MODELFILE_NAME="mobilenet_v1_1.0_224_quant_and_labels.zip"
diff --git a/tests/scripts/test_scheduler_with_profiling_android.sh b/tests/scripts/test_scheduler_with_profiling_android.sh
new file mode 100644
index 000000000..48576a98d
--- /dev/null
+++ b/tests/scripts/test_scheduler_with_profiling_android.sh
@@ -0,0 +1,230 @@
+#!/system/bin/sh
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# How to run benchmark testing
+#
+# This script is copy of test_scheduler_with_profiling.sh for Android.
+# As Android does not provide bash, this with models/run_test_android.sh
+# and common_android.sh, three scripts are modified for Android benchmark
+# testing using Android shell.
+# Test models are downloaded into models folder but as Android also doesn't
+# provide downloading in shell script, user should push downloaded models
+# to Android device also.
+#
+# 1. To download test models,
+#    run test_scheduler_with_profiling.sh from in Ubuntu/ARM device
+# 2. You will have download models in tests/scripts/models/cache folder
+# 3. Build for OneRT for Android
+# 4. Copy files
+#    adb shell mkdir -p /data/local/tmp/Product/report/benchmark
+#    adb push tests /data/local/tmp/.
+#    adb push Product/aarch64-android.release/out /data/local/tmp/Product/.
+#
+# 5. Run benchmark inside Android shell
+#    export LD_LIBRARY_PATH=/data/local/tmp/Product/out/lib
+#    cd /data/local/tmp
+#    sh /data/local/tmp/tests/scripts/test_scheduler_with_profiling_android.sh
+#
+
+MY_PATH="$( cd "$( dirname "$0" )" && pwd )"
+
+SHELL_CMD=/system/bin/sh
+
+source $MY_PATH/common_android.sh
+
+BACKEND_CNT=3
+# Run profiler BACKEND_CNT+1 times: on each run of the first BACKEND_CNT runs it will
+#     collect metrics for one unmeasured backend. On the last run metrics for data transfer
+PROFILING_RUN_CNT=$((BACKEND_CNT+1))
+TEST_DRIVER_DIR="$( cd "$( dirname "$0" )" && pwd )"
+
+ARTIFACT_PATH="$TEST_DRIVER_DIR/../.."
+BENCHMARK_DRIVER_BIN=$ARTIFACT_PATH/Product/out/bin/tflite_run
+REPORT_DIR=$ARTIFACT_PATH/report
+RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/models/run_test_android.sh
+BENCHMARK_MODEL_LIST="MODELS/inception_nonslim MODELS/inception_slim MODELS/mobilenet"
+
+if [ ! -e "$RUN_TEST_SH" ]; then
+    echo "Cannot find $RUN_TEST_SH"
+    exit 1
+fi
+
+BENCHMARK_REPORT_DIR=$REPORT_DIR/benchmark
+BENCHMARK_MODELS_FILE=$BENCHMARK_REPORT_DIR/benchmark_models.txt
+
+function run_without_sched()
+{
+    local RESULT_SCH_INT=$1
+    local REPORT_MODEL_DIR=$2
+    local MODEL=$3
+    local EXECUTOR=$4
+    local BACKEND=$5
+
+    #LOG_FILE=$REPORT_MODEL_DIR/tflite_${EXECUTOR,,}_$BACKEND.txt
+    LOG_FILE=$REPORT_MODEL_DIR/tflite_$EXECUTOR_$BACKEND.txt
+    export OP_BACKEND_ALLOPS=$BACKEND
+    export EXECUTOR=$EXECUTOR
+
+    print_with_dots "$EXECUTOR $BACKEND without scheduler"
+
+    RESULT=$(get_result_of_benchmark_test $BENCHMARK_DRIVER_BIN $MODEL $LOG_FILE)
+
+    # printf -v RESULT_INT '%d' $RESULT 2>/dev/null
+    RESULT_I=$(printf "%.0f" $RESULT)
+    RESULT_INT=$(expr $RESULT_I)
+    PERCENTAGE=$((100 - RESULT_SCH_INT * 100 / RESULT_INT))
+    echo "$RESULT ms. Parallel scheduler is $PERCENTAGE % faster"
+}
+
+function run_benchmark_test()
+{
+    local LOG_FILE=
+    local RESULT=
+    local REPORT_MODEL_DIR=
+
+    export COUNT=5
+    echo "============================================"
+    local i=0
+    export USE_NNAPI=1
+    export BACKENDS="acl_cl;acl_neon;cpu"
+    # Remove metrics so that profiler can get metrics for operations
+    #      with input&output sizes the same as the model
+    rm "exec_time.json" 2>/dev/null
+    for MODEL in $BENCHMARK_MODEL_LIST; do
+
+        echo "Benchmark test with `basename $BENCHMARK_DRIVER_BIN` & `echo $MODEL`"
+        echo $MODEL >> $BENCHMARK_MODELS_FILE
+
+        REPORT_MODEL_DIR=$BENCHMARK_REPORT_DIR/scheduler_benchmark/$MODEL
+        mkdir -p $REPORT_MODEL_DIR
+
+##################################################################################
+        # Get metrics by running profiler
+##################################################################################
+        export USE_SCHEDULER=1
+        export PROFILING_MODE=1
+        export EXECUTOR="Dataflow"
+        export ONERT_LOG_ENABLE=1
+        for j in 1 2 3 4; do # 1 to $PROFILING_RUN_CNT
+            # Save the verbose log of each run
+            LOG_FILE=$REPORT_MODEL_DIR/tflite_profiling_$j.txt
+
+            print_with_dots "Profiling run #$j out of $PROFILING_RUN_CNT"
+
+            $SHELL_CMD $RUN_TEST_SH --driverbin=$BENCHMARK_DRIVER_BIN $MODEL > $LOG_FILE 2>&1
+            RET=$?
+            if [[ $RET -ne 0 ]]; then
+                echo "Profiling $MODEL aborted in run#$j... exit code: $RET"xX
+                exit $RET
+            fi
+            echo "finished"
+            # Save the exec_time.json of each run
+            cp "exec_time.json" $REPORT_MODEL_DIR/"exec_time_$j.json"
+        done
+        unset ONERT_LOG_ENABLE
+
+
+##################################################################################
+        # Turn off profiling
+##################################################################################
+        export PROFILING_MODE=0
+
+##################################################################################
+        # Run ParallelExecutor with scheduler
+##################################################################################
+        LOG_FILE=$REPORT_MODEL_DIR/tflite_parallel_with_scheduler.txt
+        export EXECUTOR="Parallel"
+        export GRAPH_DOT_DUMP=1
+        print_with_dots "Parallel with scheduler"
+
+        RESULT=$(get_result_of_benchmark_test $BENCHMARK_DRIVER_BIN $MODEL $LOG_FILE)
+        echo "$RESULT ms"
+
+        # printf -v RESULT_SCH_INT '%d' $RESULT 2>/dev/null
+        RESULT_I=$(printf "%.0f" $RESULT)
+        RESULT_SCH_INT=$(expr $RESULT_I)
+
+        mv "after_lower_subg-0.dot" $REPORT_MODEL_DIR/"after_lower_subg-0_parallel.dot"
+
+##################################################################################
+        # Run Linear executor with scheduler
+##################################################################################
+        LOG_FILE=$REPORT_MODEL_DIR/tflite_linear_with_scheduler.txt
+        export EXECUTOR="Linear"
+        export GRAPH_DOT_DUMP=1
+        print_with_dots "Linear with scheduler"
+
+        RESULT=$(get_result_of_benchmark_test $BENCHMARK_DRIVER_BIN $MODEL $LOG_FILE)
+
+        # printf -v RESULT_INT '%d' $RESULT 2>/dev/null
+        RESULT_I=$(printf "%.0f" $RESULT)
+        RESULT_INT=$(expr $RESULT_I)
+
+        PERCENTAGE=$((100 - $RESULT_SCH_INT * 100 / $RESULT_INT))
+
+        echo "$RESULT ms. Parallel scheduler is $PERCENTAGE % faster"
+
+        # Remove metrics so that for next model in profiler can get metrics
+        #   for operations with input&output sizes the same as the model
+        mv "exec_time.json" $REPORT_MODEL_DIR
+        # Save the dot graph
+        mv "after_lower_subg-0.dot" $REPORT_MODEL_DIR/"after_lower_subg-0_linear.dot"
+        unset GRAPH_DOT_DUMP
+
+##################################################################################
+        # Turn off scheduler
+##################################################################################
+        export USE_SCHEDULER=0
+
+        # Run LinearExecutor on acl_cl without scheduler
+        run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Linear" "acl_cl"
+
+        # Run LinearExecutor on acl_neon without scheduler
+        run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Linear" "acl_neon"
+
+        # Run LinearExecutor on cpu without scheduler
+        # run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Linear" "cpu"
+
+        # Run ParallelExecutor on acl_cl without scheduler
+        run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Parallel" "acl_cl"
+
+        # Run ParallelExecutor on acl_neon without scheduler
+        run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Parallel" "acl_neon"
+
+        # Run ParallelExecutor on cpi without scheduler
+        # run_without_sched $RESULT_SCH_INT $REPORT_MODEL_DIR $MODEL "Parallel" "cpu"
+
+        if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then
+            echo ""
+        fi
+        i=$((i+1))
+
+        unset USE_SCHEDULER
+        unset PROFILING_MODE
+        unset EXECUTOR
+        unset OP_BACKEND_ALLOPS
+    done
+    unset BACKENDS
+    echo "============================================"
+    unset COUNT
+    unset USE_NNAPI
+
+}
+
+echo ""
+run_benchmark_test
+echo ""
diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt
index ec45db4f6..afbcbeaca 100644
--- a/tests/tools/nnpackage_run/CMakeLists.txt
+++ b/tests/tools/nnpackage_run/CMakeLists.txt
@@ -32,7 +32,7 @@ endif(HDF5_FOUND)
 target_include_directories(nnpackage_run PRIVATE src)
 target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS})
 
-target_link_libraries(nnpackage_run onert_core onert tflite_loader)
+target_link_libraries(nnpackage_run tflite_loader)
 target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp)
 target_link_libraries(nnpackage_run nnfw-dev)
 target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc
index 90021bff3..c0f937797 100644
--- a/tests/tools/nnpackage_run/src/args.cc
+++ b/tests/tools/nnpackage_run/src/args.cc
@@ -216,13 +216,19 @@ void Args::Initialize(void)
          "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n"
          "{nnpkg} name may be changed to realpath if you use symbolic-link.")
     ("shape_prepare", po::value<std::string>()->default_value("[]")->notifier(process_shape_prepare),
-         "set shape of specified tensor before compilation (before calling nnfw_prepare()).\n"
-         "'h5': read shape(s) from H5 input file. '--load' should also be provided.\n"
-         "'[0, [1, 2], 2, []]': set 0th tensor to [1, 2] and 2nd tensor to [].")
+         "Please refer to the description of 'shape_run'")
     ("shape_run", po::value<std::string>()->default_value("[]")->notifier(process_shape_run),
-         "set shape of specified tensor before running (before calling nnfw_run()).\n"
+         "'--shape_prepare: set shape of tensors before compilation (before calling nnfw_prepare()).\n"
+         "'--shape_run: set shape of tensors before running (before calling nnfw_run()).\n"
+         "Allowed value:.\n"
+         "'[0, [1, 2], 2, []]': set 0th tensor to [1, 2] and 2nd tensor to [] (scalar).\n"
+#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
          "'h5': read shape(s) from H5 input file. '--load' should also be provided.\n"
-         "'[0, [1, 2], 2, []]': set 0th tensor to [1, 2] and 2nd tensor to [].")
+         "if '--load' option is provided but '--shape_prepare' or '--shape_run' is not provided,\n"
+         "'--shape_run h5' will be used by default.\n"
+#endif
+         "For detailed description, please consutl the description of nnfw_set_input_tensorinfo()\n"
+         )
     ("verbose_level,v", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }),
          "Verbose level\n"
          "0: prints the only result. Messages btw run don't print\n"
@@ -249,6 +255,11 @@ void Args::Parse(const int argc, char **argv)
                                             "' cannot be given at once.");
       }
     };
+
+    // calling, e.g., "nnpackage_run .. -- shape_prepare .. --shape_run .." should theoretically
+    // work but allowing both options together on command line makes the usage and implemenation
+    // of nnpackage_run too complicated. Therefore let's not allow those option together.
+    conflicting_options("shape_prepare", "shape_run");
   }
 
   if (vm.count("help"))
@@ -288,4 +299,18 @@ void Args::Parse(const int argc, char **argv)
   }
 }
 
+bool Args::shapeParamProvided()
+{
+  bool provided = false;
+#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
+  // "--shape_run h5" or "--shape_prepare h5" was provided
+  provided = (getWhenToUseH5Shape() != WhenToUseH5Shape::NOT_PROVIDED);
+#endif
+  // specific shape was provided
+  // e.g., "--shape_run '[0, [10, 1]]'" or "--shape_prepare '[0, [10, 1]]'"
+  provided |= (!getShapeMapForPrepare().empty()) || (!getShapeMapForRun().empty());
+
+  return provided;
+}
+
 } // end of namespace nnpkg_run
diff --git a/tests/tools/nnpackage_run/src/args.h b/tests/tools/nnpackage_run/src/args.h
index d2b33fc82..11fd00023 100644
--- a/tests/tools/nnpackage_run/src/args.h
+++ b/tests/tools/nnpackage_run/src/args.h
@@ -34,9 +34,9 @@ using TensorShapeMap = std::unordered_map<uint32_t, TensorShape>;
 #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
 enum class WhenToUseH5Shape
 {
-  DO_NOT_USE, // don't use shapes in h5 file
-  PREPARE,    // read shapes in h5 file and set them as inputs' shape before calling nnfw_prepare()
-  RUN,        // read shapes in h5 file and set them as inputs' shape before calling nnfw_run()
+  NOT_PROVIDED, // Param not provided
+  PREPARE, // read shapes in h5 file and set them as inputs' shape before calling nnfw_prepare()
+  RUN,     // read shapes in h5 file and set them as inputs' shape before calling nnfw_run()
 };
 #endif
 
@@ -62,6 +62,8 @@ public:
   const bool printVersion(void) const { return _print_version; }
   TensorShapeMap &getShapeMapForPrepare() { return _shape_prepare; }
   TensorShapeMap &getShapeMapForRun() { return _shape_run; }
+  /// @brief Return true if "--shape_run" or "--shape_prepare" is provided
+  bool shapeParamProvided();
   const int getVerboseLevel(void) const { return _verbose_level; }
 
 private:
@@ -76,7 +78,7 @@ private:
 #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
   std::string _dump_filename;
   std::string _load_filename;
-  WhenToUseH5Shape _when_to_use_h5_shape = WhenToUseH5Shape::DO_NOT_USE;
+  WhenToUseH5Shape _when_to_use_h5_shape = WhenToUseH5Shape::NOT_PROVIDED;
 #endif
   TensorShapeMap _shape_prepare;
   TensorShapeMap _shape_run;
diff --git a/tests/tools/nnpackage_run/src/nnpackage_run.cc b/tests/tools/nnpackage_run/src/nnpackage_run.cc
index a78e144d8..05632393b 100644
--- a/tests/tools/nnpackage_run/src/nnpackage_run.cc
+++ b/tests/tools/nnpackage_run/src/nnpackage_run.cc
@@ -143,11 +143,15 @@ int main(const int argc, char **argv)
 
 // set input shape before compilation
 #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
+
+    auto fill_shape_from_h5 = [&session](const std::string &h5_file, TensorShapeMap &shape_map) {
+      assert(!h5_file.empty());
+      auto shapes = H5Formatter(session).readTensorShapes(h5_file);
+      overwriteShapeMap(shape_map, shapes);
+    };
+
     if (args.getWhenToUseH5Shape() == WhenToUseH5Shape::PREPARE)
-    {
-      auto shapes = H5Formatter(session).readTensorShapes(args.getLoadFilename());
-      overwriteShapeMap(args.getShapeMapForPrepare(), shapes);
-    }
+      fill_shape_from_h5(args.getLoadFilename(), args.getShapeMapForPrepare());
 #endif
     setTensorInfo(args.getShapeMapForPrepare());
 
@@ -160,11 +164,9 @@ int main(const int argc, char **argv)
 
 // set input shape after compilation and before execution
 #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
-    if (args.getWhenToUseH5Shape() == WhenToUseH5Shape::RUN)
-    {
-      auto shapes = H5Formatter(session).readTensorShapes(args.getLoadFilename());
-      overwriteShapeMap(args.getShapeMapForRun(), shapes);
-    }
+    if (args.getWhenToUseH5Shape() == WhenToUseH5Shape::RUN ||
+        (!args.getLoadFilename().empty() && !args.shapeParamProvided()))
+      fill_shape_from_h5(args.getLoadFilename(), args.getShapeMapForRun());
 #endif
     setTensorInfo(args.getShapeMapForRun());
 
diff --git a/tools/cross/aarch64/sources.list.focal b/tools/cross/aarch64/sources.list.focal
new file mode 100644
index 000000000..4de2600c1
--- /dev/null
+++ b/tools/cross/aarch64/sources.list.focal
@@ -0,0 +1,11 @@
+deb http://ports.ubuntu.com/ubuntu-ports/ focal main restricted universe
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal main restricted universe
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-updates main restricted universe
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-updates main restricted universe
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-backports main restricted
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-backports main restricted
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-security main restricted universe multiverse
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-security main restricted universe multiverse
diff --git a/tools/cross/arm/sources.list.focal b/tools/cross/arm/sources.list.focal
new file mode 100644
index 000000000..4de2600c1
--- /dev/null
+++ b/tools/cross/arm/sources.list.focal
@@ -0,0 +1,11 @@
+deb http://ports.ubuntu.com/ubuntu-ports/ focal main restricted universe
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal main restricted universe
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-updates main restricted universe
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-updates main restricted universe
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-backports main restricted
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-backports main restricted
+
+deb http://ports.ubuntu.com/ubuntu-ports/ focal-security main restricted universe multiverse
+deb-src http://ports.ubuntu.com/ubuntu-ports/ focal-security main restricted universe multiverse
diff --git a/tools/cross/install_rootfs.sh b/tools/cross/install_rootfs.sh
index f6c59b9c5..fa045f4ad 100755
--- a/tools/cross/install_rootfs.sh
+++ b/tools/cross/install_rootfs.sh
@@ -27,8 +27,9 @@ __Apt=""
 __UbuntuPackages="build-essential"
 
 # other development supports
-__UbuntuPackages+=" libboost-all-dev ocl-icd-opencl-dev"
+__UbuntuPackages+=" ocl-icd-opencl-dev"
 __UbuntuPackages+=" libhdf5-dev"
+__UbuntuBoostPackages=" llibboost-all-dev"
 
 # symlinks fixer
 __UbuntuPackages+=" symlinks"
@@ -77,6 +78,10 @@ for i in "$@" ; do
         bionic)
             __LinuxCodeName=bionic
             ;;
+        focal)
+            __LinuxCodeName=focal
+            __UbuntuBoostPackages=" libboost1.67-all-dev"
+            ;;
         --setproxy*)
             proxyip="${i#*=}"
             __Apt="Acquire::http::proxy \"http://$proxyip/\";\n"
@@ -93,6 +98,9 @@ for i in "$@" ; do
     esac
 done
 
+# Current runtime build system supports boost version under 1.70
+__UbuntuPackages+="$__UbuntuBoostPackages"
+
 __RootfsDir="$__CrossDir/rootfs/$__BuildArch"
 
 if [[ -n "$ROOTFS_DIR" ]]; then
diff --git a/tools/release_tool/git_release.sh b/tools/release_tool/git_release.sh
index adba7df2f..00bf6bb70 100755
--- a/tools/release_tool/git_release.sh
+++ b/tools/release_tool/git_release.sh
@@ -5,7 +5,7 @@
 getopt --test > /dev/null
 if [ $? -ne 4 ]; then
   echo "[ERROR] Your system doesn't have enhanced getopt"
-  echo 2
+  exit 2
 fi
 
 function Usage()
@@ -25,6 +25,7 @@ function Usage()
   echo "--repo_owner       Owner of the repository"
   echo "--repo_name        The name of the repository"
   echo "--asset            Path of release asset"
+  echo "--asset_url        URL from which release asset is downloaded"
   echo ""
   echo "[EXAMPLE]"
   echo "$ ./git_release.sh --tag 1.9.0 --commitish release/1.9.0 --token 0de25f1ca5d1d758fe877b18c06 \\"
@@ -34,7 +35,8 @@ function Usage()
   echo "$ ./git_release.sh --tag v1.1 --commitish c024e85d0ce6cb1ed2fbc66f1a9c1c2814da7575 \\"
   echo "  --token 0de25f1ca5d1d758fe877b18c06 --repo_owner Samsung --repo_name ONE \\"
   echo "  --release_name \"Release Automation\" --release_note /home/mhs4670go/ONE/release_doc \\"
-  echo "  --host_name github.sec.company.net --draft"
+  echo "  --host_name github.sec.company.net --draft \\"
+  echo "  --asset_url \"http://one.server.com/artifacts/ONE-compiler.tar.gz\""
   echo ""
   echo "[REFERENCE]"
   echo "https://developer.github.com/v3/repos/releases/#create-a-release"
@@ -53,7 +55,8 @@ token:,\
 host_name:,\
 repo_owner:,\
 repo_name:,\
-asset:"
+asset:,\
+asset_url:"
 
 OPTS=$(getopt --options "$SHORT_OPTS" --longoptions "$LONG_OPTS" --name "$0" -- "$@")
 
@@ -71,6 +74,7 @@ unset REPO_OWNER
 unset REPO_NAME
 IS_DRAFT=false
 ASSET_PATHS=()
+ASSET_URLS=()
 
 while true ; do
   case "$1" in
@@ -118,6 +122,10 @@ while true ; do
       ASSET_PATHS+=("$2")
       shift 2
       ;;
+    --asset_url )
+      ASSET_URLS+=("$2")
+      shift 2
+      ;;
     -- )
       shift
       break
@@ -146,6 +154,12 @@ if [ -z ${USER_TOKEN} ]; then
   exit 0
 fi
 
+ASSETS_FROM_URL=()
+# Get asset name from url
+for ASSET_URL in "${ASSET_URLS[@]}"; do
+  ASSETS_FROM_URL+=($(basename "${ASSET_URL}"))
+done
+
 # Print variables and set default value
 DEFAULT_RELEASE_NAME="ONE Release ${TAG_NAME}"
 DEFAULT_HOST_NAME="api.github.com"
@@ -162,6 +176,7 @@ echo "HOST_NAME        : ${HOST_NAME:=${DEFAULT_HOST_NAME}}"
 echo "REPO_OWNER       : ${REPO_OWNER:=${DEFAULT_REPO_OWNER}}"
 echo "REPO_NAME        : ${REPO_NAME:=${DEFAULT_REPO_NAME}}"
 echo "ASSETS           : ${ASSET_PATHS[@]}"
+echo "ASSETS_FROM_URL  : ${ASSETS_FROM_URL[@]}"
 echo "==========================================================="
 
 function generate_release_data()
@@ -183,7 +198,7 @@ RELEASE_URL=$(curl -s --request GET --header "Authorization: token ${USER_TOKEN}
 https://${HOST_NAME}/repos/${REPO_OWNER}/${REPO_NAME}/releases/tags/${TAG_NAME} | \
 jq -r '.url')
 
-if [ $RELEASE_URL != null ]; then
+if [ "$RELEASE_URL" != null ]; then
   echo "[ERROR] The tag name you specified already exists."
   exit 2
 fi
@@ -197,10 +212,30 @@ jq -r '.upload_url')
 
 UPLOAD_URL=$(echo ${UPLOAD_URL} | cut -d "{" -f 1)?name=
 
-# Upload the assets
+# Download assets from url
+TMPDIR=$(mktemp -d)
+pushd $TMPDIR
+for ASSET_URL in "${ASSET_URLS[@]}"; do
+  wget "$ASSET_URL"
+done
+popd
+
+# Upload the assets from url
+for ASSET_NAME in "${ASSETS_FROM_URL[@]}"; do
+  ASSET_PATH="${TMPDIR}/${ASSET_NAME}"
+  curl -s --request POST --header "Authorization: token ${USER_TOKEN}" \
+  --header "Content-Type: $(file -b --mime-type ${ASSET_PATH})" \
+  --data-binary @${ASSET_PATH} \
+  ${UPLOAD_URL}${ASSET_NAME} > /dev/null
+done
+
+rm -rf ${TMPDIR}
+
+# Upload the assets from local
 for ASSET_PATH in "${ASSET_PATHS[@]}"; do
+  ASSET_BASENAME=$(basename ${ASSET_PATH})
   curl -s --request POST --header "Authorization: token ${USER_TOKEN}" \
   --header "Content-Type: $(file -b --mime-type ${ASSET_PATH})" \
   --data-binary @${ASSET_PATH} \
-  ${UPLOAD_URL}${ASSET_PATH} > /dev/null
+  ${UPLOAD_URL}${ASSET_BASENAME} > /dev/null
 done
diff --git a/tools/release_tool/onert_version.sh b/tools/release_tool/onert_version.sh
index eafe96e3d..55ac033ef 100755
--- a/tools/release_tool/onert_version.sh
+++ b/tools/release_tool/onert_version.sh
@@ -5,6 +5,7 @@ set -eu
 progname=$(basename "${BASH_SOURCE[0]}")
 script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 nnfw_root="$( cd "${script_dir%*/*/*}" && pwd )"
+nightly=1
 
 usage() {
   echo "Usage: $progname version"
@@ -12,7 +13,8 @@ usage() {
   echo ""
   echo "Options:"
   echo "    -h   show this help"
-  echo "    -s   set onert  version"
+  echo "    -n   show current onert version with nightly suffix"
+  echo "    -s   set onert version"
   echo ""
   echo "Examples:"
   echo "    $progname           => show current onert version"
@@ -22,7 +24,17 @@ usage() {
 
 show_version() {
   version_line=$(cat ${nnfw_root}/packaging/nnfw.spec | grep "Version:")
-  echo ${version_line#"Version:"}
+  current_version=${version_line#"Version:"}
+
+  if [ $nightly -eq 0 ]; then
+    # Get head commit's date
+    pushd $nnfw_root > /dev/null
+    date=$(git log -1 --format=%ad --date=format:%y%m%d)
+    echo $current_version-nightly-$date
+    popd > /dev/null
+  else
+    echo $current_version
+  fi
 
   exit 0
 }
@@ -43,9 +55,10 @@ if [ $# -eq 0 ]; then
   show_version
 fi
 
-while getopts "hs:" OPTION; do
+while getopts "hns:" OPTION; do
 case "${OPTION}" in
     h) usage;;
+    n) nightly=0; show_version;;
     s) set_version "$OPTARG";;
     ?) exit 1;;
 esac
diff --git a/tools/tflitefile_tool/select_operator.py b/tools/tflitefile_tool/select_operator.py
index 863edea57..a1aa6f263 100755
--- a/tools/tflitefile_tool/select_operator.py
+++ b/tools/tflitefile_tool/select_operator.py
@@ -218,15 +218,26 @@ def GenerateTensor(new_builder, selected_tensor, used_buffers_dic):
     if quantization != None:
         new_quantization = GenerateQuantization(new_builder, quantization)
 
+    # Create IsVariable
+    is_variable = selected_tensor.IsVariable()
+
+    # Create Sparsity
+    sparsity = selected_tensor.Sparsity()
+
     # Create tensor
     tflite.Tensor.TensorStart(new_builder)
     tflite.Tensor.TensorAddShape(new_builder, new_shape)
     tflite.Tensor.TensorAddType(new_builder, tensor_type)
-    tflite.Tensor.TensorAddBuffer(new_builder, new_buffer_idx)
+    if (new_buffer_idx != 0):
+        tflite.Tensor.TensorAddBuffer(new_builder, new_buffer_idx)
     if name_string != "":
         tflite.Tensor.TensorAddName(new_builder, new_name)
     if quantization != None:
         tflite.Tensor.TensorAddQuantization(new_builder, new_quantization)
+    tflite.Tensor.TensorAddIsVariable(new_builder, is_variable)
+
+    if sparsity != None:
+        tflite.Tensor.TensorAddSparsity(new_builder, sparsity)
 
     return tflite.Tensor.TensorEnd(new_builder)
 
@@ -871,7 +882,6 @@ def GenerateBuiltinOption(new_builder, selected_builtin_option, builtin_option_t
     # FillOptions: not supported
     # BidirectionalSequenceLSTMOptions: not supported
     # BidirectionalSequenceRNNOptions: not supported
-    # UnidirectionalSequenceLSTMOptions: not supported
     # FloorModOptions: not supported
     # RangeOptions: not supported
     # ResizeNearestNeighborOptions: not supported
@@ -901,6 +911,31 @@ def GenerateBuiltinOption(new_builder, selected_builtin_option, builtin_option_t
         tflite.SquaredDifferenceOptions.SquaredDifferenceOptionsStart(new_builder)
         return tflite.SquaredDifferenceOptions.SquaredDifferenceOptionsEnd(new_builder)
 
+    # UnidirectionalSequenceLSTMOptions
+    import tflite.UnidirectionalSequenceLSTMOptions
+    if builtin_option_type == tflite.BuiltinOptions.BuiltinOptions(
+    ).UnidirectionalSequenceLSTMOptions:
+
+        unidirectional_sequence_lstm_option = tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptions(
+        )
+        unidirectional_sequence_lstm_option.Init(selected_builtin_option.Bytes,
+                                                 selected_builtin_option.Pos)
+
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsStart(
+            new_builder)
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsAddFusedActivationFunction(
+            new_builder, unidirectional_sequence_lstm_option.FusedActivationFunction())
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsAddCellClip(
+            new_builder, unidirectional_sequence_lstm_option.CellClip())
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsAddProjClip(
+            new_builder, unidirectional_sequence_lstm_option.ProjClip())
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsAddTimeMajor(
+            new_builder, unidirectional_sequence_lstm_option.TimeMajor())
+        tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsAddAsymmetricQuantizeInputs(
+            new_builder, unidirectional_sequence_lstm_option.AsymmetricQuantizeInputs())
+        return tflite.UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptionsEnd(
+            new_builder)
+
     # MirrorPadOptions: not supported
     # AbsOptions: not supported
     # SplitVOptions: not supported
@@ -1267,7 +1302,8 @@ def main(args):
     for used_tensor in used_tensors:
         # key and value is same in prepare phase
         buf_idx = (sample_subgraph.Tensors(used_tensor)).Buffer()
-        used_buffers.append(buf_idx)
+        if buf_idx != 0:
+            used_buffers.append(buf_idx)
 
     # Append buffers of tensors of child subgraphs
     for subgraph_idx in used_subgraphs_list:
@@ -1275,7 +1311,8 @@ def main(args):
             continue
         for tensor_idx in range(sample_model.Subgraphs(subgraph_idx).TensorsLength()):
             tensor = sample_model.Subgraphs(subgraph_idx).Tensors(tensor_idx)
-            used_buffers.append(tensor.Buffer())
+            if tensor.Buffer() != 0:
+                used_buffers.append(tensor.Buffer())
 
     used_buffers.sort()
 
@@ -1296,6 +1333,8 @@ def main(args):
     # Assign new index for buffer
     used_buffers_dic = {}
 
+    # Tensor has empty buffer if buffer index is 0.
+    used_buffers_dic[0] = 0
     for new_buffer_idx in range(len(used_buffers)):
         sample_buffer_idx = used_buffers[new_buffer_idx]
         used_buffers_dic[sample_buffer_idx] = new_buffer_idx
@@ -1315,7 +1354,7 @@ def main(args):
             if input_tensor_idx in new_input_tensors:
                 matched_buffer_idx = sample_subgraph.Tensors(input_tensor_idx).Buffer()
                 matched_buffer = sample_model.Buffers(matched_buffer_idx)
-                if matched_buffer.DataLength() != 0:
+                if matched_buffer_idx == 0 or matched_buffer.DataLength() != 0:
                     new_input_tensors.remove(input_tensor_idx)
 
         for output_idx in range(operator.OutputsLength()):
diff --git a/tools/tflitefile_tool/tensor_printer.py b/tools/tflitefile_tool/tensor_printer.py
index 7c2904346..108a119d6 100755
--- a/tools/tflitefile_tool/tensor_printer.py
+++ b/tools/tflitefile_tool/tensor_printer.py
@@ -51,6 +51,7 @@ class TensorPrinter(object):
             print_str = "Tensor {0:4}".format(self.tensor.tensor_idx)
         else:
             buffer_idx = self.tensor.tf_tensor.Buffer()
+            buffer_str = "Empty" if buffer_idx == 0 else str(buffer_idx)
             isEmpty = "Filled"
             if (self.tensor.tf_buffer.DataLength() == 0):
                 isEmpty = " Empty"
@@ -63,8 +64,8 @@ class TensorPrinter(object):
 
             memory_size = ConvertBytesToHuman(self.tensor.memory_size)
 
-            print_str = "Tensor {0:4} : buffer {1:4} | {2} | {3:7} | Memory {4:6} | Shape {5} ({6})".format(
-                self.tensor.tensor_idx, buffer_idx, isEmpty, type_name, memory_size,
+            print_str = "Tensor {0:4} : buffer {1:5} | {2} | {3:7} | Memory {4:6} | Shape {5} ({6})".format(
+                self.tensor.tensor_idx, buffer_str, isEmpty, type_name, memory_size,
                 shape_str, shape_name)
         print(depth_str + print_str)
author	Chunseok Lee <chunseok.lee@samsung.com>	2020-10-28 12:16:55 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2020-10-28 12:16:55 +0900
commit	c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (patch)
tree	761ee8e171e5203f5c598ad93b2e7e0bc2e31aa2
parent	74476a2d0296bdad70a2f7f90bc7419a8b05bffd (diff)
download	nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.gz nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.bz2 nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.zip