author    | Chunseok Lee <chunseok.lee@samsung.com> | 2020-10-28 12:16:55 +0900
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-10-28 12:16:55 +0900
commit    | c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (patch)
tree      | 761ee8e171e5203f5c598ad93b2e7e0bc2e31aa2 /runtime
parent    | 74476a2d0296bdad70a2f7f90bc7419a8b05bffd (diff)
download  | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.gz
          | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.bz2
          | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.zip
Imported Upstream version 1.10.0 (refs: upstream/1.10.0, submit/tizen/20201028.104702, submit/tizen/20201028.031836, accepted/tizen/unified/20201029.124827)
Diffstat (limited to 'runtime')
220 files changed, 8212 insertions(+), 6602 deletions(-)
diff --git a/runtime/contrib/android/api/Android.mk b/runtime/contrib/android/api/Android.mk index a056eff9d..3c768cca5 100644 --- a/runtime/contrib/android/api/Android.mk +++ b/runtime/contrib/android/api/Android.mk @@ -4,7 +4,5 @@ include $(CLEAR_VARS) API_ROOT_PATH := $(LOCAL_PATH) PREBUILT_LIB := -include $(API_ROOT_PATH)/prebuilt/Android.mk +include $(API_ROOT_PATH)/Prebuilt.mk include $(API_ROOT_PATH)/src/main/native/Android.mk - -#$(warning $(PREBUILT_LIB)) diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk new file mode 100644 index 000000000..7d9f56582 --- /dev/null +++ b/runtime/contrib/android/api/Prebuilt.mk @@ -0,0 +1,70 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +ifndef ONERT_PREBUILT_LIB_DIR +$(error ONERT_PREBUILT_LIB_DIR is not set) +endif + +# libcircle_loader +include $(CLEAR_VARS) +LOCAL_MODULE := circle_loader +PREBUILT_LIB += circle_loader +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libcircle_loader.so +include $(PREBUILT_SHARED_LIBRARY) + +# libtflite_loader +include $(CLEAR_VARS) +LOCAL_MODULE := tflite_loader +PREBUILT_LIB += tflite_loader +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libtflite_loader.so +include $(PREBUILT_SHARED_LIBRARY) + +# libtensorflowlite_jni +include $(CLEAR_VARS) +LOCAL_MODULE := tensorflowlite_jni +PREBUILT_LIB += tensorflowlite_jni +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libtensorflowlite_jni.so +include $(PREBUILT_SHARED_LIBRARY) + +# libnnfw +include $(CLEAR_VARS) +LOCAL_MODULE := nnfw-dev +PREBUILT_LIB += nnfw-dev +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libnnfw-dev.so +include $(PREBUILT_SHARED_LIBRARY) + +# libonert_core +include $(CLEAR_VARS) +LOCAL_MODULE := onert_core +PREBUILT_LIB += onert_core +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libonert_core.so +include $(PREBUILT_SHARED_LIBRARY) + +# backend_cpu +include $(CLEAR_VARS) +LOCAL_MODULE := backend_cpu +PREBUILT_LIB += backend_cpu +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libbackend_cpu.so +include $(PREBUILT_SHARED_LIBRARY) + +# TODO Support backend acl +# backend_acl +ifeq ($(ONERT_CONTAINS_ACL), 1) + $(error containing acl backend doesn't supported yet) +endif + +# backend_ext +ifneq ($(ONERT_EXT_PREBUILT_LIB), ) +include $(CLEAR_VARS) +LOCAL_MODULE := backend_ext +PREBUILT_LIB += backend_ext +LOCAL_SRC_FILES := \ + $(ONERT_EXT_PREBUILT_LIB) +include $(PREBUILT_SHARED_LIBRARY) +endif diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle index def89eeac..afc53d936 100644 --- a/runtime/contrib/android/api/build.gradle +++ b/runtime/contrib/android/api/build.gradle @@ -8,11 +8,39 @@ android { minSdkVersion 26 targetSdkVersion 29 versionCode 1 - versionName "1.9.0" + versionName "1.10.0" externalNativeBuild { ndkBuild { - arguments "ONERT_API_INC_DIR=${project.projectDir}/../../../onert/api/include" + def onert_header_dir + if (project.hasProperty('onertHeaderDir')) + onert_header_dir = project.onertHeaderDir + else + onert_header_dir = "${project.projectDir}/../../../onert/api/include" + + def onert_lib_dir + if (project.hasProperty('onertLibDir')) + onert_lib_dir = project.onertLibDir + else + onert_lib_dir = "${project.projectDir}/../../../../Product/out/lib" + + def onert_contains_acl + if (project.hasProperty('onertContainsAcl')) + onert_contains_acl = 1 + else + onert_contains_acl = 0 + + def onert_ext_lib + if (project.hasProperty('onertExtLib')) + onert_ext_lib = project.onertExtLib + else + onert_ext_lib = "" + + arguments 
"ONERT_API_INC_DIR=$onert_header_dir", + "ONERT_PREBUILT_LIB_DIR=$onert_lib_dir", + "ONERT_CONTAINS_ACL=$onert_contains_acl", + "ONERT_EXT_PREBUILT_LIB=$onert_ext_lib" + abiFilters 'arm64-v8a' } } diff --git a/runtime/contrib/android/api/prebuilt/Android.mk b/runtime/contrib/android/api/prebuilt/Android.mk deleted file mode 100644 index e8a9f0755..000000000 --- a/runtime/contrib/android/api/prebuilt/Android.mk +++ /dev/null @@ -1,9 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -PREBUILT_PATH := $(LOCAL_PATH) -include $(PREBUILT_PATH)/backend_cpu/Android.mk -include $(PREBUILT_PATH)/circle_loader/Android.mk -include $(PREBUILT_PATH)/nnfw-dev/Android.mk -include $(PREBUILT_PATH)/onert_core/Android.mk -include $(PREBUILT_PATH)/tensorflowlite_jni/Android.mk -include $(PREBUILT_PATH)/tflite_loader/Android.mk diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk b/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk deleted file mode 100644 index ccda9ea90..000000000 --- a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := backend_cpu -PREBUILT_LIB += backend_cpu -LOCAL_SRC_FILES := \ - libbackend_cpu.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so b/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so deleted file mode 120000 index 3d577cf5c..000000000 --- a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libbackend_cpu.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk b/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk deleted file mode 100644 index 2e481e93e..000000000 --- a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := circle_loader -PREBUILT_LIB += circle_loader -LOCAL_SRC_FILES := \ - libcircle_loader.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so b/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so deleted file mode 120000 index 528d7017f..000000000 --- a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libcircle_loader.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk b/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk deleted file mode 100644 index 10cb8f6f4..000000000 --- a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := nnfw-dev -PREBUILT_LIB += nnfw-dev -LOCAL_SRC_FILES := \ - libnnfw-dev.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so b/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so deleted file mode 120000 index 1913db8d7..000000000 --- a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libnnfw-dev.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk b/runtime/contrib/android/api/prebuilt/onert_core/Android.mk deleted file mode 100644 index a6682a24f..000000000 --- a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := onert_core -PREBUILT_LIB += onert_core -LOCAL_SRC_FILES := \ - libonert_core.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so b/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so deleted file mode 120000 index bafe11cb9..000000000 --- a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libonert_core.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk deleted file mode 100644 index 823cf0747..000000000 --- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := tensorflowlite_jni -PREBUILT_LIB += tensorflowlite_jni -LOCAL_SRC_FILES := \ - libtensorflowlite_jni.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so deleted file mode 120000 index d3d72a5a7..000000000 --- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libtensorflowlite_jni.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk b/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk deleted file mode 100644 index 135ac1dad..000000000 --- a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := tflite_loader -PREBUILT_LIB += tflite_loader -LOCAL_SRC_FILES := \ - libtflite_loader.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so b/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so deleted file mode 120000 index 4c001aec0..000000000 --- a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libtflite_loader.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp index 1644e0f7f..209264d31 100644 --- a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp +++ b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp @@ -121,8 +121,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet if (jni::setInput(handle, params) == false) { - __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setOutput", - __PRETTY_FUNCTION__); + __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setInput", __PRETTY_FUNCTION__); return JNI_FALSE; } diff --git a/runtime/contrib/android_benchmark_app/CMakeLists.txt b/runtime/contrib/android_benchmark_app/CMakeLists.txt index 55dbf0024..beb279cb9 100644 --- a/runtime/contrib/android_benchmark_app/CMakeLists.txt +++ b/runtime/contrib/android_benchmark_app/CMakeLists.txt @@ -55,7 +55,7 @@ target_link_libraries(android_benchmark_native nnfw_lib_tflite) target_link_libraries(android_benchmark_native nnfw_lib_misc) target_link_libraries(android_benchmark_native log) -nnas_find_package(FlatBuffersSource EXACT 1.11 REQUIRED) +nnas_find_package(FlatBuffersSource EXACT 1.12 REQUIRED) target_include_directories(android_benchmark_native PUBLIC ${FlatBuffersSource_DIR}/include .) add_custom_target(android-benchmark-apk ALL diff --git a/runtime/libs/ndarray/src/ContiguousSpan.cpp b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h index e06cfc2a1..6e8e12ba4 100644 --- a/runtime/libs/ndarray/src/ContiguousSpan.cpp +++ b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,27 @@ * limitations under the License. 
*/ -#include "ndarray/ContiguousSpan.h" +#ifndef __NNFW_BENCHMARK_MEMORY_INFO_H__ +#define __NNFW_BENCHMARK_MEMORY_INFO_H__ -namespace ndarray +#include <cstdint> +#include <string> + +namespace benchmark { -template class ContiguousSpan<float, true>; -template class ContiguousSpan<float, false>; -template class ContiguousSpan<int32_t, true>; -template class ContiguousSpan<int32_t, false>; -template class ContiguousSpan<uint32_t, true>; -template class ContiguousSpan<uint32_t, false>; -template class ContiguousSpan<uint8_t, true>; -template class ContiguousSpan<uint8_t, false>; +bool prepareVmRSS(); +bool prepareVmHWM(); +bool prepareGpuMemory(); +bool preparePssSum(); + +uint32_t getVmRSS(); +uint32_t getVmHWM(); +uint32_t getGpuMemory(const std::string &process_name); +uint32_t getPssSum(); + +std::string getProcessName(); + +} // namespace benchmark -} // namespace ndarray +#endif // __NNFW_BENCHMARK_MEMORY_INFO_H__ diff --git a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h index 48caa3b3a..47db3fd77 100644 --- a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h +++ b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h @@ -57,10 +57,6 @@ public: private: void process(); bool prepareMemoryPolling(); - uint32_t getVmRSS(); - uint32_t getVmHWM(); - uint32_t getGpuMemory(); - uint32_t getPssSum(); private: std::chrono::milliseconds _duration; diff --git a/runtime/libs/benchmark/include/benchmark/Phases.h b/runtime/libs/benchmark/include/benchmark/Phases.h index 936a89742..7d642782a 100644 --- a/runtime/libs/benchmark/include/benchmark/Phases.h +++ b/runtime/libs/benchmark/include/benchmark/Phases.h @@ -50,6 +50,9 @@ public: const MemoryPoller &mem_poll() const { return *_mem_poll; } const Phase &at(const std::string &tag) const { return _phases.at(tag); } + uint32_t mem_before_init() const { return _mem_before_init; } + uint32_t mem_after_run() const { return _mem_after_run; } + private: void run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc *post, uint32_t loop_num, bool option_disable); @@ -58,6 +61,8 @@ private: const PhaseOption _option; std::unordered_map<std::string, Phase> _phases; std::unique_ptr<MemoryPoller> _mem_poll; + uint32_t _mem_before_init; + uint32_t _mem_after_run; }; } // namespace benchmark diff --git a/runtime/libs/benchmark/include/benchmark/Result.h b/runtime/libs/benchmark/include/benchmark/Result.h index 69084b300..7604aa904 100644 --- a/runtime/libs/benchmark/include/benchmark/Result.h +++ b/runtime/libs/benchmark/include/benchmark/Result.h @@ -34,6 +34,8 @@ public: double time[PhaseEnum::END_OF_PHASE][FigureType::END_OF_FIG_TYPE]; uint32_t memory[PhaseEnum::END_OF_PHASE][MemoryType::END_OF_MEM_TYPE]; bool print_memory = false; + uint32_t init_memory = 0; + uint32_t peak_memory = 0; }; // TODO Support not only stdout but also ostream diff --git a/runtime/libs/benchmark/src/MemoryInfo.cpp b/runtime/libs/benchmark/src/MemoryInfo.cpp new file mode 100644 index 000000000..20d262961 --- /dev/null +++ b/runtime/libs/benchmark/src/MemoryInfo.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/MemoryInfo.h" + +#include <vector> +#include <algorithm> +#include <fstream> +#include <sstream> +#include <cassert> +#include <sys/time.h> +#include <sys/resource.h> + +namespace +{ + +const std::string proc_status_path("/proc/self/status"); +const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory"); +const std::string proc_smaps_path("/proc/self/smaps"); + +bool isStrNumber(const std::string &s) +{ + return !s.empty() && + std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); +} + +std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t") +{ + std::vector<std::string> words; + size_t prev = 0, pos; + + while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) + { + if (pos > prev) + words.emplace_back(line.substr(prev, pos - prev)); + prev = pos + 1; + } + + if (prev < line.length()) + words.emplace_back(line.substr(prev, std::string::npos)); + + return words; +} + +std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key) +{ + std::ifstream ifs(file); + assert(ifs.is_open()); + + std::string line; + std::vector<std::string> val; + + bool found = false; + while (std::getline(ifs, line)) + { + if (line.find(key) != std::string::npos) + { + found = true; + break; + } + } + ifs.close(); + + if (!found) + { + // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase. + // At that time, just return empty. 
+ return val; + } + + val = splitLine(line); + return val; +} + +// Because of smaps' structure, returns sum value as uint32_t +uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key) +{ + std::ifstream ifs(file); + assert(ifs.is_open()); + + std::string line; + uint32_t sum = 0; + while (std::getline(ifs, line)) + { + if (line.find(key) != std::string::npos) + { + // an example by splitLine() + // `Pss: 0 kB` + // val[0]: "Pss:", val[1]: "0" val[2]: "kB" + auto val = splitLine(line); + assert(val.size() != 0); + // SwapPss could show so that check where Pss is at the beginning + if (val[0].find("Pss") != 0) + { + continue; + } + sum += std::stoul(val[1]); + } + } + + return sum; +} + +} // namespace + +namespace benchmark +{ + +bool prepareVmRSS() { return std::ifstream(proc_status_path).is_open(); } + +bool prepareVmHWM() { return std::ifstream(proc_status_path).is_open(); } + +bool prepareGpuMemory() { return std::ifstream(gpu_memory_path).is_open(); } + +bool preparePssSum() { return std::ifstream(proc_smaps_path).is_open(); } + +uint32_t getVmRSS() +{ + auto val = getValueFromFileStatus(proc_status_path, "VmRSS"); + if (val.size() == 0) + return 0; + assert(isStrNumber(val[1])); + return std::stoul(val[1]); +} + +uint32_t getVmHWM() +{ + auto val = getValueFromFileStatus(proc_status_path, "VmHWM"); + if (val.size() == 0) + return 0; + // key: value + assert(isStrNumber(val[1])); + return std::stoul(val[1]); +} + +uint32_t getGpuMemory(const std::string &process_name) +{ + assert(!process_name.empty()); + auto val = getValueFromFileStatus(gpu_memory_path, process_name); + if (val.size() == 0) + return 0; + // process_name -> pid -> gpu_mem -> max_gpu_mem + assert(isStrNumber(val[2])); + return std::stoul(val[2]); +} + +uint32_t getPssSum() { return getSumValueFromFileSmaps(proc_smaps_path, "Pss"); } + +std::string getProcessName() +{ + auto val = getValueFromFileStatus(proc_status_path, "Name"); + assert(val.size() >= 2); + return val[1]; +} + +} // namespace benchmark diff --git a/runtime/libs/benchmark/src/MemoryPoller.cpp b/runtime/libs/benchmark/src/MemoryPoller.cpp index 61fdecd46..050b5b163 100644 --- a/runtime/libs/benchmark/src/MemoryPoller.cpp +++ b/runtime/libs/benchmark/src/MemoryPoller.cpp @@ -16,106 +16,13 @@ #include "benchmark/MemoryPoller.h" #include "benchmark/Types.h" +#include "benchmark/MemoryInfo.h" #include <vector> -#include <fstream> -#include <sstream> #include <stdexcept> #include <cassert> #include <iostream> -namespace -{ - -const std::string proc_status_path("/proc/self/status"); -const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory"); -const std::string proc_smaps_path("/proc/self/smaps"); - -bool isStrNumber(const std::string &s) -{ - return !s.empty() && - std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); -} - -std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t") -{ - std::vector<std::string> words; - size_t prev = 0, pos; - - while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) - { - if (pos > prev) - words.emplace_back(line.substr(prev, pos - prev)); - prev = pos + 1; - } - - if (prev < line.length()) - words.emplace_back(line.substr(prev, std::string::npos)); - - return words; -} - -std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key) -{ - std::ifstream ifs(file); - assert(ifs.is_open()); - - std::string line; - std::vector<std::string> val; - - bool found = false; - while 
(std::getline(ifs, line)) - { - if (line.find(key) != std::string::npos) - { - found = true; - break; - } - } - ifs.close(); - - if (!found) - { - // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase. - // At that time, just return empty. - return val; - } - - val = splitLine(line); - return val; -} - -// Because of smaps' structure, returns sum value as uint32_t -uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key) -{ - std::ifstream ifs(file); - assert(ifs.is_open()); - - std::string line; - uint32_t sum = 0; - while (std::getline(ifs, line)) - { - if (line.find(key) != std::string::npos) - { - // an example by splitLine() - // `Pss: 0 kB` - // val[0]: "Pss:", val[1]: "0" val[2]: "kB" - auto val = splitLine(line); - assert(val.size() != 0); - // SwapPss could show so that check where Pss is at the beginning - if (val[0].find("Pss") != 0) - { - continue; - } - sum += std::stoul(val[1]); - } - } - - return sum; -} - -} // namespace - namespace benchmark { @@ -168,7 +75,7 @@ bool MemoryPoller::end(PhaseEnum phase) mem = getVmRSS(); if (_gpu_poll) { - mem += getGpuMemory(); + mem += getGpuMemory(_process_name); } if (mem > _rss_map[phase]) _rss_map[phase] = mem; @@ -176,7 +83,7 @@ bool MemoryPoller::end(PhaseEnum phase) mem = getVmHWM(); if (_gpu_poll) { - mem += getGpuMemory(); + mem += getGpuMemory(_process_name); } _hwm_map[phase] = mem; @@ -208,7 +115,7 @@ void MemoryPoller::process() uint32_t cur_hwm = getVmHWM(); if (_gpu_poll) { - auto gpu_mem = getGpuMemory(); + auto gpu_mem = getGpuMemory(_process_name); cur_rss += gpu_mem; cur_hwm += gpu_mem; } @@ -236,77 +143,33 @@ void MemoryPoller::process() bool MemoryPoller::prepareMemoryPolling() { // VmRSS + if (!prepareVmRSS()) { - std::ifstream ifs(proc_status_path); - if (!ifs.is_open()) - { - std::cerr << "failed to open " << proc_status_path << std::endl; - return false; - } - ifs.close(); + std::cerr << "failed to prepare parsing vmrss" << std::endl; + return false; } // (Additionally) GpuMemory if (_gpu_poll) { - std::ifstream ifs(gpu_memory_path); - if (!ifs.is_open()) + if (!prepareGpuMemory()) { - std::cerr << "failed to open " << gpu_memory_path << std::endl; + std::cerr << "failed to prepare parsing gpu memory" << std::endl; return false; } - ifs.close(); // Needs process name - auto val = getValueFromFileStatus(proc_status_path, "Name"); - assert(val.size() != 0); - _process_name = val[1]; + _process_name = getProcessName(); } // PSS + if (!preparePssSum()) { - std::ifstream ifs(proc_smaps_path); - if (!ifs.is_open()) - { - std::cerr << "failed to open " << proc_smaps_path << std::endl; - return false; - } - ifs.close(); + std::cerr << "failed to prepare parsing pss sum" << std::endl; + return false; } return true; } -uint32_t MemoryPoller::getVmRSS() -{ - auto val = getValueFromFileStatus(proc_status_path, "VmRSS"); - if (val.size() == 0) - return 0; - assert(isStrNumber(val[1])); - return std::stoul(val[1]); -} - -uint32_t MemoryPoller::getVmHWM() -{ - auto val = getValueFromFileStatus(proc_status_path, "VmHWM"); - if (val.size() == 0) - return 0; - // key: value - assert(isStrNumber(val[1])); - return std::stoul(val[1]); -} - -uint32_t MemoryPoller::getGpuMemory() -{ - assert(!_process_name.empty()); - auto val = getValueFromFileStatus(gpu_memory_path, _process_name); - if (val.size() == 0) - return 0; - // process_name -> pid -> gpu_mem -> max_gpu_mem - assert(isStrNumber(val[2])); - return std::stoul(val[2]); -} - -uint32_t MemoryPoller::getPssSum() { return 
getSumValueFromFileSmaps(proc_smaps_path, "Pss"); } - } // namespace benchmark diff --git a/runtime/libs/benchmark/src/Phases.cpp b/runtime/libs/benchmark/src/Phases.cpp index 9ab67cfd9..897b943d3 100644 --- a/runtime/libs/benchmark/src/Phases.cpp +++ b/runtime/libs/benchmark/src/Phases.cpp @@ -17,6 +17,7 @@ #include "benchmark/Phases.h" #include "benchmark/Types.h" +#include "benchmark/MemoryInfo.h" #include <cassert> #include <chrono> @@ -46,8 +47,11 @@ void SleepForMicros(uint64_t micros) namespace benchmark { -Phases::Phases(const PhaseOption &option) : _option(option) +Phases::Phases(const PhaseOption &option) : _option(option), _mem_before_init(0), _mem_after_run(0) { + assert(prepareVmRSS()); + _mem_before_init = getVmHWM(); + if (_option.memory) { _mem_poll = std::make_unique<MemoryPoller>(std::chrono::milliseconds(option.memory_interval), @@ -93,6 +97,8 @@ void Phases::run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc } } + _mem_after_run = getVmHWM(); + if (p == PhaseEnum::END_OF_PHASE) { return; diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp index df573da92..e6cafb91c 100644 --- a/runtime/libs/benchmark/src/Result.cpp +++ b/runtime/libs/benchmark/src/Result.cpp @@ -141,6 +141,15 @@ void printResultMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE] } } +void printUsedPeakMemory(uint32_t init_memory, uint32_t peak_memory) +{ + uint32_t used_peak_memory = peak_memory - init_memory; + std::cout << "Used Peak Memory : " << used_peak_memory << " kb" << std::endl; + std::cout << "- HWM after run : " << peak_memory << " kb" << std::endl; + std::cout << "- HWM before init: " << init_memory << " kb" << std::endl; + std::cout << "===================================" << std::endl; +} + } // namespace namespace benchmark @@ -175,6 +184,8 @@ Result::Result(const Phases &phases) } } } + init_memory = phases.mem_before_init(); + peak_memory = phases.mem_after_run(); } void printResult(const Result &result) @@ -185,6 +196,7 @@ void printResult(const Result &result) return; printResultMemory(result.memory); + printUsedPeakMemory(result.init_memory, result.peak_memory); } // TODO There are necessary for a kind of output data file so that it doesn't have to be csv file diff --git a/runtime/libs/misc/include/misc/polymorphic_downcast.h b/runtime/libs/misc/include/misc/polymorphic_downcast.h index 412b864e6..ee885eb70 100644 --- a/runtime/libs/misc/include/misc/polymorphic_downcast.h +++ b/runtime/libs/misc/include/misc/polymorphic_downcast.h @@ -27,9 +27,7 @@ namespace misc template <typename DstType, typename SrcType> inline DstType polymorphic_downcast(SrcType *x) { -#ifndef __ANDROID__ assert(dynamic_cast<DstType>(x) == x); -#endif return static_cast<DstType>(x); } diff --git a/runtime/libs/ndarray/CMakeLists.txt b/runtime/libs/ndarray/CMakeLists.txt deleted file mode 100644 index b040f5115..000000000 --- a/runtime/libs/ndarray/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -add_library(ndarray STATIC src/Array.cpp src/ContiguousSpan.cpp) - -set_target_properties(ndarray PROPERTIES POSITION_INDEPENDENT_CODE ON) - -target_include_directories(ndarray PUBLIC include) -#can't make this private because of c++ templates -target_include_directories(ndarray PUBLIC src) - -option(NDARRAY_INLINE_TEMPLATES "Set to ON to disable extern declarations for common types") - -if(${NDARRAY_INLINE_TEMPLATES}) - target_compile_definitions(ndarray PUBLIC -DNDARRAY_INLINE_TEMPLATES=1) -endif() - -target_link_libraries(ndarray PRIVATE nnfw_common) 
-target_link_libraries(ndarray PRIVATE nnfw_coverage) - -add_subdirectory(test) -add_subdirectory(example) diff --git a/runtime/libs/ndarray/example/CMakeLists.txt b/runtime/libs/ndarray/example/CMakeLists.txt deleted file mode 100644 index c4b575dad..000000000 --- a/runtime/libs/ndarray/example/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_executable(example_no_array example_no_array.cpp) - -add_executable(example_array example_array.cpp) -target_link_libraries(example_array PRIVATE ndarray) diff --git a/runtime/libs/ndarray/example/example_array.cpp b/runtime/libs/ndarray/example/example_array.cpp deleted file mode 100644 index 85d274681..000000000 --- a/runtime/libs/ndarray/example/example_array.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ndarray/Array.h" - -#include <iostream> -#include <iterator> - -using namespace ndarray; - -void gather_array(const Array<float> &input, Array<float> &output, const Array<int> &indices) -{ - assert(indices.shape().rank() == 3); - assert(input.shape().rank() == 3); - assert(indices.shape().dim(1) == input.shape().rank()); - - for (size_t i = 0; i < indices.shape().dim(0); ++i) - { - for (size_t j = 0; j < indices.shape().dim(1); ++j) - { - auto index = indices.slice(i, j); - output.slice(i, j).assign(input.slice(index[0], index[1])); - } - } -} - -int main() -{ - // fill tensor of shape[3,3,4] with sequential numbers from [0..36) - Shape in_shape{3, 3, 4}; - std::vector<float> input_data(in_shape.element_count()); - for (size_t i = 0; i < in_shape.element_count(); ++i) - input_data[i] = i; - - Array<float> input(input_data.data(), in_shape); - - // select column-vectors on main diagonal - Shape indices_shape{1, 3, 2}; - std::vector<int> indices_data(indices_shape.element_count()); - Array<int> indices(indices_data.data(), indices_shape); - - indices.slice(0, 0) = {0, 0}; - indices.slice(0, 1) = {1, 1}; - indices.slice(0, 2) = {2, 2}; - - Shape output_shape{1, 3, 4}; - std::vector<float> output_data(output_shape.element_count()); - - Array<float> output(output_data.data(), output_shape); - - gather_array(input, output, indices); - - for (size_t i = 0; i < indices_shape.dim(0); ++i) - { - for (size_t j = 0; j < indices_shape.dim(1); ++j) - { - auto output_piece = output.slice(i, j); - std::ostream_iterator<int> cout_it(std::cout, ", "); - std::copy(output_piece.begin(), output_piece.end(), cout_it); - std::cout << std::endl; - } - } -} diff --git a/runtime/libs/ndarray/example/example_no_array.cpp b/runtime/libs/ndarray/example/example_no_array.cpp deleted file mode 100644 index 3a4d05dca..000000000 --- a/runtime/libs/ndarray/example/example_no_array.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <array> -#include <vector> -#include <algorithm> -#include <cassert> -#include <iostream> - -void gather_no_array(const float *in_data, const std::array<size_t, 3> &dims, float *out_data, - const std::array<size_t, 3> &out_dims, //[nselections, - const int *indices, const std::array<size_t, 3> &indices_dims) -{ - assert(indices_dims[1] == dims.size()); - - for (int i = 0; i < indices_dims[0]; ++i) - { - for (int j = 0; j < indices_dims[1]; ++j) - { - const int *index_ptr = indices + i * indices_dims[2] * indices_dims[1] + j * indices_dims[2]; - - size_t in_offset = index_ptr[0] * dims[2] * dims[1] + index_ptr[1] * dims[2]; - - const float *in_ptr = in_data + in_offset; - - size_t out_offset = i * out_dims[2] * out_dims[1] + j * out_dims[2]; - - float *out_ptr = out_data + out_offset; - - for (int k = 0; k < dims[2]; ++k) - { - out_ptr[k] = in_ptr[k]; - } - } - } -} - -int main() -{ - std::array<size_t, 3> in_dims{3, 3, 4}; - std::vector<float> input(3 * 3 * 4); - for (size_t i = 0; i < 3 * 3 * 4; ++i) - input[i] = i; - - std::array<size_t, 3> indices_shape{1, 3, 2}; - std::vector<int> indices(1 * 3 * 2); - - indices[0] = 0; - indices[1] = 0; - indices[2] = 1; - indices[3] = 1; - indices[4] = 2; - indices[5] = 2; - - std::array<size_t, 3> output_dims{1, 3, 4}; - std::vector<float> output(1 * 3 * 4); - - gather_no_array(input.data(), in_dims, output.data(), output_dims, indices.data(), indices_shape); - - for (size_t i = 0; i < output_dims[0]; ++i) - { - for (size_t j = 0; j < output_dims[1]; ++j) - { - auto out_ptr = output.data() + i * output_dims[1] * output_dims[2] + j * output_dims[2]; - for (size_t k = 0; k < output_dims[2]; ++k) - { - std::cout << out_ptr[k] << ", "; - } - std::cout << std::endl; - } - } -} diff --git a/runtime/libs/ndarray/include/ndarray/Array.h b/runtime/libs/ndarray/include/ndarray/Array.h deleted file mode 100644 index 3890cc26b..000000000 --- a/runtime/libs/ndarray/include/ndarray/Array.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef _NDARRAY_ARRAY_H_ -#define _NDARRAY_ARRAY_H_ - -#include "Common.h" - -#include "ContiguousSpan.h" -#include "Shape.h" - -#if __cplusplus < 201402L -#include "detail/cxx14.h" //integer_sequence and make_index_dequence definitions -#else -#include <utility> -#endif - -#include <algorithm> -#include <cassert> -#include <type_traits> -#include <array> -#include <tuple> -#include <cstddef> - -namespace ndarray -{ - -// there is no index_sequence before c++14 -#if __cplusplus < 201402L - -template <size_t... Nums> using index_sequence = cxx14::index_sequence<Nums...>; - -template <size_t Num> using make_index_sequence = cxx14::make_index_sequence<Num>; - -#else - -template <size_t... Nums> using index_sequence = std::index_sequence<Nums...>; - -template <size_t _Num> using make_index_sequence = std::make_index_sequence<_Num>; - -#endif //__cplusplus < 201402L - -struct Strides -{ - explicit Strides(Shape s) : _strides{} { fillStrides(s); } - - int operator[](size_t idx) const noexcept { return _strides[idx]; } - - // since we don't have c++14 fold expression - template <typename Seq, typename... Ts> struct _calc_offset; - - template <size_t Num, size_t... Nums, typename T, typename... Ts> - struct _calc_offset<index_sequence<Num, Nums...>, T, Ts...> - { - static constexpr size_t get(const std::array<int, 8> &strides, int x, Ts... xs) - { - return _calc_offset<index_sequence<Nums...>, Ts...>::get(strides, xs...) + - x * std::get<Num>(strides); - } - }; - - template <size_t Num, typename T> struct _calc_offset<index_sequence<Num>, T> - { - static constexpr size_t get(const std::array<int, 8> &strides, int x) - { - return x * std::get<Num>(strides); - } - }; - - template <typename Seq, typename... Ts> constexpr size_t offset(Seq, Ts... x) const noexcept - { - // return ( 0 + ... + (std::get<Nums>(_strides) * x)); in c++14 - return _calc_offset<Seq, Ts...>::get(_strides, x...); - } - -private: - void fillStrides(const Shape &s) noexcept - { - int rank = s.rank(); - _strides[rank - 1] = 1; - for (int d = rank - 2; d >= 0; --d) - { - _strides[d] = _strides[d + 1] * s.dim(d + 1); - } - } - - std::array<int, NDARRAY_MAX_DIMENSION_COUNT> _strides; -}; - -template <typename T> class Array -{ -public: - Array(T *data, Shape shape) noexcept : _data(data), _shape(shape), _strides(shape) {} - - Array(const Array &) = delete; - - Array(Array &&a) noexcept : _data(a._data), _shape(a._shape), _strides(a._strides) - { - a._data = nullptr; - } - - template <typename... Ts> T &at(Ts... x) const noexcept { return _at(static_cast<size_t>(x)...); } - - /** - * @brief returns last dimension as ContigniousSpan - * @param x indices of slice to take. See tests for usage details - * @return slice at given position - */ - template <typename... Ts> ContiguousSpan<T, std::is_const<T>::value> slice(Ts... x) noexcept - { - assert(sizeof...(Ts) == _shape.rank() - 1); - return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)}; - } - - /** - * @brief returns last dimension as ContigniousSpan - * @param x indices of slice to take. See tests for usage details - * @return slice at given position - */ - template <typename... Ts> ContiguousSpan<T, true> slice(Ts... 
x) const noexcept - { - assert(sizeof...(Ts) == _shape.rank() - 1); - return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)}; - } - - ContiguousSpan<T, std::is_const<T>::value> flat() noexcept - { - return {_data, _shape.element_count()}; - } - - ContiguousSpan<T, true> flat() const noexcept { return {_data, _shape.element_count()}; } - - const Shape &shape() const noexcept { return _shape; } - -private: - template <typename... Ts> T &_at(Ts... x) const noexcept - { - assert(sizeof...(x) == _shape.rank()); - using Indices = make_index_sequence<sizeof...(Ts)>; - return _data[offset(Indices{}, x...)]; - } - - template <typename... Ts, size_t... Nums> - size_t offset(index_sequence<Nums...> seq, Ts... x) const noexcept - { - static_assert( - sizeof...(Ts) == sizeof...(Nums), - "Sanity check failed. Generated index sequence size is not equal to argument count"); - - return _strides.offset(seq, x...); - } - - T *_data; - Shape _shape; - Strides _strides; -}; - -template <typename To, typename From> Array<To> array_cast(Array<From> &&from, Shape newShape) -{ - assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count()); - return Array<To>(reinterpret_cast<To *>(from.flat().data()), newShape); -} - -template <typename To, typename From> -Array<const To> array_cast(const Array<From> &from, Shape newShape) -{ - assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count()); - return Array<To>(reinterpret_cast<const To *>(from.flat().data()), newShape); -} - -#ifndef NDARRAY_INLINE_TEMPLATES - -extern template class Array<float>; -extern template class Array<int32_t>; -extern template class Array<uint32_t>; -extern template class Array<uint8_t>; - -#endif // NDARRAY_INLINE_TEMPLATES - -} // namespace ndarray - -#endif //_NDARRAY_ARRAY_H_ diff --git a/runtime/libs/ndarray/include/ndarray/Common.h b/runtime/libs/ndarray/include/ndarray/Common.h deleted file mode 100644 index aa0cc6fe2..000000000 --- a/runtime/libs/ndarray/include/ndarray/Common.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_COMMON_H_ -#define _NDARRAY_COMMON_H_ - -#define NDARRAY_MAX_DIMENSION_COUNT 8 - -#endif //_NDARRAY_COMMON_H_ diff --git a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h b/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h deleted file mode 100644 index 8caa6a686..000000000 --- a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_CONTIGNIOUS_SPAN_H_ -#define _NDARRAY_CONTIGNIOUS_SPAN_H_ - -#include <type_traits> -#include <vector> -#include <cstdint> -#include <cstddef> -#include <cassert> - -namespace ndarray -{ - -template <typename T, bool isConst = false> class ContiguousSpan -{ -public: - using pointer_type = typename std::conditional<isConst, const T *, T *>::type; - using reference_type = typename std::conditional<isConst, const T &, T &>::type; - using iterator_type = pointer_type; - - ContiguousSpan(pointer_type data, size_t len) noexcept : _data(data), _len(len) {} - - template <typename It> - explicit ContiguousSpan(It first, It last) noexcept - : _data(&*first), _len(std::distance(first, last)) - { - } - - ContiguousSpan(const ContiguousSpan &) = delete; - - ContiguousSpan(ContiguousSpan &&s) noexcept : _data(s._data), _len(s._len) { s._data = nullptr; } - - operator ContiguousSpan<T, true>() { return ContiguousSpan<T, true>{_data, _len}; } - - reference_type operator[](size_t idx) const noexcept { return _data[idx]; } - - reference_type at(size_t idx) const noexcept { return _data[idx]; } - - ContiguousSpan<T, isConst> offset(size_t offset) - { - assert(offset <= _len); - return {_data + offset, _len - offset}; - } - - template <typename From, bool _ = isConst> - typename std::enable_if<!_, void>::type assign(const From &f) noexcept - { - assignFrom(std::begin(f), std::end(f)); - } - - template <typename U, bool _ = isConst> - typename std::enable_if<!_, ContiguousSpan &>::type - operator=(std::initializer_list<U> list) noexcept - { - assignFrom(std::begin(list), std::end(list)); - return *this; - } - - template <typename It, bool _ = isConst> - typename std::enable_if<!_, void>::type assignFrom(It first, It last) noexcept - { - std::copy(first, last, begin()); - } - - size_t size() const { return _len; } - - iterator_type begin() const { return iterator_type{_data}; } - - iterator_type end() const { return iterator_type{_data + _len}; } - - pointer_type data() { return _data; } - -private: - pointer_type _data; - size_t _len; -}; - -#ifndef NDARRAY_INLINE_TEMPLATES - -extern template class ContiguousSpan<float, true>; -extern template class ContiguousSpan<float, false>; -extern template class ContiguousSpan<int32_t, true>; -extern template class ContiguousSpan<int32_t, false>; -extern template class ContiguousSpan<uint32_t, true>; -extern template class ContiguousSpan<uint32_t, false>; -extern template class ContiguousSpan<uint8_t, true>; -extern template class ContiguousSpan<uint8_t, false>; - -#endif // NDARRAY_INLINE_TEMPLATES - -} // namespace ndarray - -#endif //_NDARRAY_CONTIGNIOUS_SPAN_H_ diff --git a/runtime/libs/ndarray/include/ndarray/Shape.h b/runtime/libs/ndarray/include/ndarray/Shape.h deleted file mode 100644 index fa58613b8..000000000 --- a/runtime/libs/ndarray/include/ndarray/Shape.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_SHAPE_H_ -#define _NDARRAY_SHAPE_H_ - -#include "Common.h" - -#include <array> -#include <cassert> -#include <cstddef> - -namespace ndarray -{ - -class Shape -{ -public: - //_dims{} here and later since array does not have std::initializer_list ctor - // and aggregate initialization is not allowed here - explicit Shape(size_t rank) noexcept : _dims{}, _rank(rank) - { - std::fill(_dims.begin(), _dims.end(), 0); - } - - Shape(std::initializer_list<size_t> list) noexcept : _dims{}, _rank(list.size()) - { - std::copy(list.begin(), list.end(), _dims.begin()); - } - - size_t dim(int i) const noexcept { return _dims.at(i); } - - size_t &dim(int i) noexcept { return _dims.at(i); } - - size_t element_count() const noexcept - { - uint32_t res = 1; - for (size_t i = 0; i < rank(); ++i) - res *= dim(i); - assert(res <= 0xffffffff); - return res; - } - - size_t rank() const noexcept { return _rank; } - -private: - std::array<size_t, NDARRAY_MAX_DIMENSION_COUNT> _dims; - size_t _rank; -}; - -} // namespace ndarray - -#endif //_NDARRAY_SHAPE_H_ diff --git a/runtime/libs/ndarray/src/detail/cxx14.h b/runtime/libs/ndarray/src/detail/cxx14.h deleted file mode 100644 index 81135b3f2..000000000 --- a/runtime/libs/ndarray/src/detail/cxx14.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_CXX14_H_ -#define _NDARRAY_CXX14_H_ - -namespace ndarray -{ - -namespace cxx14 -{ - -template <size_t... Nums> struct index_sequence -{ - using value_type = size_t; - - static constexpr std::size_t size() noexcept { return sizeof...(Nums); } -}; - -namespace detail -{ - -template <size_t v, typename Seq> struct _append; - -template <size_t v, size_t... 
Nums> struct _append<v, index_sequence<Nums...>> -{ - using result = index_sequence<Nums..., v>; -}; - -template <size_t Len> struct make_index_sequence -{ - using result = - typename detail::_append<Len - 1, typename make_index_sequence<Len - 1>::result>::result; -}; - -template <> struct make_index_sequence<1> -{ - using result = index_sequence<0>; -}; - -template <> struct make_index_sequence<0> -{ - using result = index_sequence<>; -}; - -} // namespace detail - -template <size_t Num> using make_index_sequence = typename detail::make_index_sequence<Num>::result; - -} // namespace cxx14 - -} // namespace ndarray - -#endif //_NDARRAY_CXX14_H_ diff --git a/runtime/libs/ndarray/test/CMakeLists.txt b/runtime/libs/ndarray/test/CMakeLists.txt deleted file mode 100644 index 16f8779ee..000000000 --- a/runtime/libs/ndarray/test/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -if(NOT BUILD_NDARRAY_TEST) - return() -endif() - -add_executable(ndarray_test ndarray_test.cpp) - -target_link_libraries(ndarray_test PRIVATE ndarray) - -nnfw_find_package(GTest) -if(NOT GTest_FOUND) - message(STATUS "GTest not avaialble. Skipping NDArray test build") - return() -endif(NOT GTest_FOUND) - -target_link_libraries(ndarray_test PUBLIC gtest gtest_main ${LIB_PTHREAD}) - -add_test(ndarray_test ndarray_test) diff --git a/runtime/libs/ndarray/test/ndarray_test.cpp b/runtime/libs/ndarray/test/ndarray_test.cpp deleted file mode 100644 index 0aa948c72..000000000 --- a/runtime/libs/ndarray/test/ndarray_test.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "ndarray/Array.h" - -using namespace ndarray; - -TEST(NDArray_tests, basic_data_test) -{ - - float raw_data[] = {1, 2, 3, 4}; - - Array<float> data22{raw_data, {2, 2}}; - - ASSERT_FLOAT_EQ(data22.at(0, 0), 1); - ASSERT_FLOAT_EQ(data22.at(0, 1), 2); - ASSERT_FLOAT_EQ(data22.at(1, 0), 3); - ASSERT_FLOAT_EQ(data22.at(1, 1), 4); - - Array<float> data14{raw_data, {1, 4}}; - ASSERT_FLOAT_EQ(data22.at(0, 0), 1); - ASSERT_FLOAT_EQ(data22.at(0, 1), 2); - ASSERT_FLOAT_EQ(data22.at(0, 2), 3); - ASSERT_FLOAT_EQ(data22.at(0, 3), 4); -} - -TEST(NDArray_tests, slice_write_test) -{ - float raw_data[4] = {0}; - - Array<float> data22{raw_data, {2, 2}}; - - data22.slice(1) = {1, 2}; - - ASSERT_FLOAT_EQ(data22.at(0, 0), 0); - ASSERT_FLOAT_EQ(data22.at(0, 1), 0); - ASSERT_FLOAT_EQ(data22.at(1, 0), 1); - ASSERT_FLOAT_EQ(data22.at(1, 1), 2); -} - -TEST(NDArray_tests, slice_read_test) -{ - float raw_data[4] = {1, 2, 3, 4}; - - Array<float> data22{raw_data, {2, 2}}; - - auto slice = data22.slice(1); - - ASSERT_FLOAT_EQ(slice[0], 3); - ASSERT_FLOAT_EQ(slice[1], 4); -} - -TEST(NDArray_tests, multidim_test) -{ - float raw_data[5] = {0, 1, 2, 3, 4}; - - Array<float> data22{raw_data, {1, 1, 1, 1, 5}}; - - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 0), 0); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 1), 1); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 2), 2); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 3), 3); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 4), 4); -} - -TEST(NDArray_tests, slice_assign_test) -{ - std::vector<float> v1{1, 2, 3, 4, 5}; - std::vector<float> v2(5); - - ContiguousSpan<float> span1(v1.begin(), v1.end()); - ContiguousSpan<float> span2(v2.begin(), v2.end()); - - span2.assign(span1); - - ASSERT_EQ(v1, v2); -} diff --git a/runtime/libs/nnapi/CMakeLists.txt b/runtime/libs/nnapi/CMakeLists.txt index a5d9490d1..73f82b909 100644 --- a/runtime/libs/nnapi/CMakeLists.txt +++ b/runtime/libs/nnapi/CMakeLists.txt @@ -1,3 +1,4 @@ -add_subdirectories() +add_library(nnfw_lib_nnapi INTERFACE) -add_library(nnfw_lib_nnapi ALIAS nnfw_lib_nnapi_1_2) +target_include_directories(nnfw_lib_nnapi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(nnfw_lib_nnapi INTERFACE nnfw-nnapi-header) diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/include/NeuralNetworksExShim.h index 855613241..855613241 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h +++ b/runtime/libs/nnapi/include/NeuralNetworksExShim.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h index 1c482b54c..1c482b54c 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h +++ b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h b/runtime/libs/nnapi/include/NeuralNetworksShim.h index 80082383f..80082383f 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h +++ b/runtime/libs/nnapi/include/NeuralNetworksShim.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h b/runtime/libs/nnapi/include/NeuralNetworksTypes.h index d74402749..d74402749 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h +++ b/runtime/libs/nnapi/include/NeuralNetworksTypes.h diff --git a/runtime/libs/nnapi/v1.1/CMakeLists.txt b/runtime/libs/nnapi/v1.1/CMakeLists.txt deleted file mode 100644 index dc018c60f..000000000 --- a/runtime/libs/nnapi/v1.1/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ 
-add_library(nnfw_lib_nnapi_1_1 INTERFACE) - -target_include_directories(nnfw_lib_nnapi_1_1 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(nnfw_lib_nnapi_1_1 INTERFACE nnfw-nnapi-header) diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h deleted file mode 100644 index f684dab90..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -/** - * @file NeuralNetworksExShim.h - * @brief This file contains an actual implementation of - * ANeuralNetworksModel_addOperationEx function - * @ingroup COM_AI_RUNTIME - */ - -#ifndef NN_API_EX_SHIM_H -#define NN_API_EX_SHIM_H - -#include "NeuralNetworksEx.h" -#include "NeuralNetworksLoadHelpers.h" - -typedef int (*ANeuralNetworksModel_addOperationEx_fn)(ANeuralNetworksModel *model, - ANeuralNetworksOperationTypeEx type, - uint32_t inputCount, const uint32_t *inputs, - uint32_t outputCount, - const uint32_t *outputs); - -/** - * @brief Add an extended operation to a model. - * - * @param[in] model The model to be modified. - * @param[in] type The type of extended operation. - * @param[in] inputCount The number of entries in the inputs array. - * @param[in] inputs An array of indexes identifying each operand. - * @param[in] outputCount The number of entries in the outputs array. - * @param[in] outputs An array of indexes identifying each operand. - * - * @note The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}.\n - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} - * has been called will return an error.\n - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ - -inline int ANeuralNetworksModel_addOperationEx(ANeuralNetworksModel *model, - ANeuralNetworksOperationTypeEx type, - uint32_t inputCount, const uint32_t *inputs, - uint32_t outputCount, const uint32_t *outputs) -{ - LOAD_FUNCTION(ANeuralNetworksModel_addOperationEx); - EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount, outputs); -} - -#endif // NN_API_EX_SHIM_H diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h deleted file mode 100644 index 201465f9c..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// NOTE To minimize diff with upstream tensorflow, disable clang-format -// clang-format off - -// NOTE This header is derived from part of the following file (in TensorFlow v1.12) -// 'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h' - -/** - * @file NeuralNetworksLoadHelpers.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains functions to load NN API runtime library - */ - -#ifndef __NEURAL_NETWORKS_LOAD_HELPER_H__ -#define __NEURAL_NETWORKS_LOAD_HELPER_H__ - -#include <dlfcn.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -/** - * @brief Print log data - * @param[in] format Format string of @c printf - * @param[in] args Argument after format string. (Same with @c printf) - */ -#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__); - -/** - * @brief Create a function pointer named @c fn after loading NN API library - * @param[in] name Name of a function - */ -#define LOAD_FUNCTION(name) \ - static name##_fn fn = reinterpret_cast<name##_fn>(nnfw::loadFunction(#name)); - -/** - * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION - * @param[in] args List of arguments for the function @c fn - */ -#define EXECUTE_FUNCTION(...) \ - if (fn != nullptr) { \ - fn(__VA_ARGS__); \ - } - -/** - * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION - * @param[in] args List of arguments for the function @c fn - * @return the return value of @c fn - */ -#define EXECUTE_FUNCTION_RETURN(...) return fn != nullptr ? fn(__VA_ARGS__) : 0; - -namespace nnfw -{ - -/** - * @brief Load NN API library - * @param[in] name path of NN API library - * @return a symbol table handle of NN API library - */ -inline void* loadLibrary(const char* name) { - // TODO: change RTLD_LOCAL? 
Assumes there can be multiple instances of nn - // api RT - void* handle = nullptr; -#if 1 //#ifdef __ANDROID__ - handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (handle == nullptr) { - NNAPI_LOG("nnapi error: unable to open library %s", name); - NNAPI_LOG(" %s", dlerror()); - } -#endif - return handle; -} - -/** - * @brief Load libneuralnetworks.so and return handle of library - * @return a symbol table handle of NN API library - */ -inline void* getLibraryHandle() { - static void* handle = loadLibrary("libneuralnetworks.so"); - return handle; -} - -/** - * @brief Return function ptr in libneuralnetworks.so - * @param[in] name Name of function - * @return function pointer - */ -inline void* loadFunction(const char* name) { - void* fn = nullptr; - if (getLibraryHandle() != nullptr) { - fn = dlsym(getLibraryHandle(), name); - } - if (fn == nullptr) { - NNAPI_LOG("nnapi error: unable to open function %s", name); - NNAPI_LOG(" %s", dlerror()); - abort(); - } - else { -#ifdef _GNU_SOURCE - Dl_info info; - if (dladdr(fn, &info)) - { - NNAPI_LOG("nnapi function '%s' is loaded from '%s' ", name, info.dli_fname); - } - else - { - NNAPI_LOG("nnapi function '%s' is failed to load", name); - } - -#endif // _GNU_SOURCE - } - return fn; -} - -/** - * @brief Check if libneuralnetworks.so can be loaded - * @return @c true if loading is successful, otherwise @c false. - */ -inline bool NNAPIExists() { - static bool nnapi_is_available = getLibraryHandle(); - return nnapi_is_available; -} - -} // namespace nnfw - -#endif // __NEURAL_NETWORKS_LOAD_HELPER_H__ diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h deleted file mode 100644 index 60b16f766..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h +++ /dev/null @@ -1,709 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
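The load-helper header removed here (its v1.2 twin is kept, unchanged, under runtime/libs/nnapi/include) implements lazy binding: the library handle and each function pointer are resolved once via dlopen/dlsym and cached in function-local statics. Below is a stripped-down sketch of the same pattern for a single entry point; it is an illustration using only POSIX dlfcn calls, with the opaque ANeuralNetworksModel* replaced by void* so the sketch has no NNAPI header dependency, and with error handling reduced to a message. Link with -ldl where dlopen is not in libc.

    #include <dlfcn.h>

    #include <cstdio>

    // Signature of the NNAPI entry point we want to bind lazily.
    // The real first parameter is ANeuralNetworksModel**; void** stands in
    // for the opaque handle here.
    typedef int (*Model_create_fn)(void** model);

    // Open libneuralnetworks.so once and cache the handle, mirroring
    // getLibraryHandle() in the header above.
    static void* nnapi_handle()
    {
      static void* handle = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
      return handle;
    }

    // Resolve the symbol once and cache it, mirroring LOAD_FUNCTION above.
    static Model_create_fn load_model_create()
    {
      static Model_create_fn fn = nnapi_handle()
          ? reinterpret_cast<Model_create_fn>(dlsym(nnapi_handle(), "ANeuralNetworksModel_create"))
          : nullptr;
      return fn;
    }

    int main()
    {
      void* model = nullptr;
      if (Model_create_fn fn = load_model_create())
        std::printf("ANeuralNetworksModel_create returned %d\n", fn(&model));
      else
        std::printf("libneuralnetworks.so is not available here\n");
      return 0;
    }
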
-==============================================================================*/ - -// NOTE To minimize diff with upstream tensorflow, disable clang-format -// clang-format off - -// NOTE This header is derived from part of the following file (in TensorFlow v1.12) -// 'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h' -#ifndef __NEURAL_NETWORKS_SHIM__ -#define __NEURAL_NETWORKS_SHIM__ - -#include "NeuralNetworks.h" -#include "NeuralNetworksLoadHelpers.h" - -// nn api function types - -typedef int (*ANeuralNetworksMemory_createFromFd_fn)( - size_t size, int protect, int fd, size_t offset, - ANeuralNetworksMemory** memory); - -typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory); - -typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model); - -typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model); - -typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model); - -typedef int (*ANeuralNetworksCompilation_create_fn)( - ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation); - -typedef void (*ANeuralNetworksCompilation_free_fn)( - ANeuralNetworksCompilation* compilation); - -typedef int (*ANeuralNetworksCompilation_setPreference_fn)( - ANeuralNetworksCompilation* compilation, int32_t preference); - -typedef int (*ANeuralNetworksCompilation_finish_fn)( - ANeuralNetworksCompilation* compilation); - -typedef int (*ANeuralNetworksModel_addOperand_fn)( - ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type); - -typedef int (*ANeuralNetworksModel_setOperandValue_fn)( - ANeuralNetworksModel* model, int32_t index, const void* buffer, - size_t length); - -typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)( - ANeuralNetworksModel* model, int32_t index, - const ANeuralNetworksMemory* memory, size_t offset, size_t length); - -typedef int (*ANeuralNetworksModel_addOperation_fn)( - ANeuralNetworksModel* model, ANeuralNetworksOperationType type, - uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, - const uint32_t* outputs); - -typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)( - ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, - uint32_t outputCount, const uint32_t* outputs); - -typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)( - ANeuralNetworksModel* model, bool allow); - -typedef int (*ANeuralNetworksExecution_create_fn)( - ANeuralNetworksCompilation* compilation, - ANeuralNetworksExecution** execution); - -typedef void (*ANeuralNetworksExecution_free_fn)( - ANeuralNetworksExecution* execution); - -typedef int (*ANeuralNetworksExecution_setInput_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const void* buffer, size_t length); - -typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length); - -typedef int (*ANeuralNetworksExecution_setOutput_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, void* buffer, size_t length); - -typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length); - -typedef int (*ANeuralNetworksExecution_startCompute_fn)( - 
ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event); - -typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event); - -typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event); - -/** - * Creates a shared memory object from a file descriptor. - * - * The shared memory is backed by a file descriptor via mmap. - * See {@link ANeuralNetworksMemory} for a description on how to use - * this shared memory. - * - * @param size The requested size in bytes. - * Must not be larger than the file size. - * @param prot The desired memory protection for the mapping. - * It is either PROT_NONE or the bitwise OR of one or - * more of the following flags: PROT_READ, PROT_WRITE. - * @param fd The requested file descriptor. - * The file descriptor has to be mmap-able. The file - * descriptor will be duplicated. - * @param offset The offset to the beginning of the file of the area to map. - * The offset has to be aligned to a page size. - * @param memory The memory object to be created. - * Set to NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if the request completed normally. - */ -inline int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd, - size_t offset, - ANeuralNetworksMemory** memory) { - LOAD_FUNCTION(ANeuralNetworksMemory_createFromFd); - EXECUTE_FUNCTION_RETURN(size, protect, fd, offset, memory); -} - -/** - * Delete a memory object. - * - * Destroys the object used by the run time to keep track of the memory. - * This will free the underlying actual memory if no other code has open - * handles to this memory. - * - * @param memory The memory object to be freed. - */ -inline void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) { - LOAD_FUNCTION(ANeuralNetworksMemory_free); - EXECUTE_FUNCTION(memory); -} - -/** - * Create an empty {@link ANeuralNetworksModel}. - * - * <p>This only creates the object. Computation is performed once - * {@link ANeuralNetworksExecution_startCompute} is invoked. - * - * The model should be constructed with calls to - * {@link ANeuralNetworksModel_addOperation} and - * {@link ANeuralNetworksModel_addOperand} - * - * <p>{@link ANeuralNetworksModel_finish} should be called once the model - * has been fully constructed.</p> - * - * <p>{@link ANeuralNetworksModel_free} should be called once the model - * is no longer needed.</p> - * - * @param model The {@link ANeuralNetworksModel} to be created. - * Set to NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_create(ANeuralNetworksModel** model) { - LOAD_FUNCTION(ANeuralNetworksModel_create); - EXECUTE_FUNCTION_RETURN(model); -} - -/** - * Destroy a model. - * - * The model need not have been finished by a call to - * {@link ANeuralNetworksModel_finish}. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -inline void ANeuralNetworksModel_free(ANeuralNetworksModel* model) { - LOAD_FUNCTION(ANeuralNetworksModel_free); - EXECUTE_FUNCTION(model); -} - -/** - * Indicate that we have finished modifying a model. Required before - * calling {@link ANeuralNetworksCompilation_compile}. - * - * An application is responsible to make sure that no other thread uses - * the model at the same time. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be finished. 
- * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) { - LOAD_FUNCTION(ANeuralNetworksModel_finish); - EXECUTE_FUNCTION_RETURN(model); -} - -/** - * Add an operand to a model. - * - * The order in which the operands are added is important. The first one added - * to a model will have the index value 0, the second 1, etc. These indexes are - * used as operand identifiers in {@link ANeuralNetworksModel_addOperation}, - * {@link ANeuralNetworksExecution_setInput}, - * {@link ANeuralNetworksExecution_setInputFromMemory}, - * {@link ANeuralNetworksExecution_setOutput}, - * {@link ANeuralNetworksExecution_setOutputFromMemory} and - * {@link ANeuralNetworksExecution_setOperandValue}. - * - * To build a model that can accommodate inputs of various sizes, as you may - * want to do for a CNN, set the size of the dimensions that will vary at run - * time to 0. If you do so, provide the full dimensions when calling - * {@link ANeuralNetworksExecution_setInput} or {@link - * ANeuralNetworksExecution_setInputFromMemory}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be modified. - * @param type The {@link ANeuralNetworksOperandType} that describes the shape - * of the operand. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_addOperand( - ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) { - LOAD_FUNCTION(ANeuralNetworksModel_addOperand); - EXECUTE_FUNCTION_RETURN(model, type); -} - -/** - * Sets an operand to a constant value. - * - * For scalar values, the content of buffer is copied into the model. - * - * For tensor values, a pointer to the buffer is stored within the model. - * The application is responsible for not changing the content of this region - * until all executions using this model have completed. As the data may - * be copied during processing, modifying the data after this call yields - * undefined results. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, - int32_t index, - const void* buffer, - size_t length) { - LOAD_FUNCTION(ANeuralNetworksModel_setOperandValue); - EXECUTE_FUNCTION_RETURN(model, index, buffer, length); -} - -/** - * Sets an operand to a value stored in a memory object. - * - * The content of the memory is not copied. A reference to that memory is stored - * inside the model. The application is responsible for not changing the content - * of the memory region until all executions using this model have completed. - * As the data may be copied during processing, modifying the data after this - * call yields undefined results. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. 
- * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_setOperandValueFromMemory( - ANeuralNetworksModel* model, int32_t index, - const ANeuralNetworksMemory* memory, size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksModel_setOperandValueFromMemory); - EXECUTE_FUNCTION_RETURN(model, index, memory, offset, length); -} - -/** - * Add an operation to a model. - * - * @param model The model to be modified. - * @param type The type of the operation. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model, - ANeuralNetworksOperationType type, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs) { - LOAD_FUNCTION(ANeuralNetworksModel_addOperation); - EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount, - outputs); -} - -/** - * Specifies which operands will be the model's inputs and outputs. - * - * An operand cannot be used for both input and output. Doing so will - * return an error. - * - * @param model The model to be modified. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying the input operands. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying the output operands. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - */ -inline int ANeuralNetworksModel_identifyInputsAndOutputs( - ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, - uint32_t outputCount, const uint32_t* outputs) { - LOAD_FUNCTION(ANeuralNetworksModel_identifyInputsAndOutputs); - EXECUTE_FUNCTION_RETURN(model, inputCount, inputs, outputCount, outputs); -} - -/** - * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be - * calculated with range and/or precision as low as that of the IEEE 754 16-bit - * floating-point format. By default, {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * must be calculated using at least the range and precision of the IEEE 754 - * 32-bit floating-point format. - * - * @param model The model to be modified. 
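The wrapper functions documented above are enough to assemble a complete model. Below is a minimal, hedged sketch that builds a single ADD operation (two float tensors plus the fused-activation scalar) through these shims; the operand struct layout and enum values are the standard ones from NeuralNetworks.h, and all error checking is elided for brevity.

    #include "NeuralNetworksShim.h"

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      ANeuralNetworksModel* model = nullptr;
      ANeuralNetworksModel_create(&model);

      // Two 1-D float tensors of four elements, plus an INT32 activation scalar.
      uint32_t dims[1] = {4};
      ANeuralNetworksOperandType tensor4 = {ANEURALNETWORKS_TENSOR_FLOAT32, 1, dims, 0.0f, 0};
      ANeuralNetworksOperandType scalarI32 = {ANEURALNETWORKS_INT32, 0, nullptr, 0.0f, 0};

      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 0: first input
      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 1: second input
      ANeuralNetworksModel_addOperand(model, &scalarI32); // operand 2: activation
      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 3: output

      // The activation operand is a constant baked into the model.
      int32_t act = ANEURALNETWORKS_FUSED_NONE;
      ANeuralNetworksModel_setOperandValue(model, 2, &act, sizeof(act));

      // output(3) = input(0) + input(1), with no fused activation.
      uint32_t addIns[3] = {0, 1, 2};
      uint32_t addOuts[1] = {3};
      ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, addIns, 1, addOuts);

      // Operands 0 and 1 are the model inputs, operand 3 the model output.
      uint32_t modelIns[2] = {0, 1};
      uint32_t modelOuts[1] = {3};
      ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, modelIns, 1, modelOuts);

      std::printf("finish: %d\n", ANeuralNetworksModel_finish(model));
      ANeuralNetworksModel_free(model);
      return 0;
    }
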
- * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be - * calculated with range and/or precision as low as that of the - * IEEE 754 16-bit floating point format. 'false' indicates - * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using - * at least the range and precision of the IEEE 754 32-bit floating - * point format. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * Available since API level 28. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - */ -inline int ANeuralNetworksModel_relaxComputationFloat32toFloat16( - ANeuralNetworksModel* model, bool allow) { - LOAD_FUNCTION(ANeuralNetworksModel_relaxComputationFloat32toFloat16); - EXECUTE_FUNCTION_RETURN(model, allow); -} - -/** - * Create a {@link ANeuralNetworksCompilation} to compile the given model. - * This only creates the object. Compilation is only performed once - * {@link ANeuralNetworksCompilation_start} is invoked. - * - * <p>The provided model must outlive the compilation.</p> - * - * The model must already have been finished by a call to - * {@link ANeuralNetworksModel_finish}. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param model The {@link ANeuralNetworksModel} to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA - * if the model is invalid. - */ -inline int ANeuralNetworksCompilation_create( - ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_create); - EXECUTE_FUNCTION_RETURN(model, compilation); -} - -/** - * Destroy a compilation. - * - * <p>If called on a compilation for which - * {@link ANeuralNetworksCompilation_start} has been called, the - * function will return immediately but will mark the compilation to be deleted - * once the compilation completes. The {@link ANeuralNetworksCompilation_wait} - * will return ERROR_DELETED. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param compilation The compilation to be destroyed. Passing NULL is - * acceptable and results in no operation. - */ -inline void ANeuralNetworksCompilation_free( - ANeuralNetworksCompilation* compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_free); - EXECUTE_FUNCTION(compilation); -} - -/** - * Sets the execution preference. - * - * <p>Provides guidance to the runtime when trade-offs are possible.</p> - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param compilation The compilation to be modified. - * @param preference Either {@link PREFER_LOW_POWER}, - * {@link PREFER_SINGLE_FAST_ANSWER}, or - * {@link PREFER_SUSTAINED_SPEED}. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksCompilation_setPreference( - ANeuralNetworksCompilation* compilation, int32_t preference) { - LOAD_FUNCTION(ANeuralNetworksCompilation_setPreference); - EXECUTE_FUNCTION_RETURN(compilation, preference); -} - -/** - * Waits until the compilation completes. - * - * More than one thread can wait on a compilation. When the compilation - * completes, all threads will be released. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. 
- * - * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally. - */ -inline int ANeuralNetworksCompilation_finish( - ANeuralNetworksCompilation* compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_finish); - EXECUTE_FUNCTION_RETURN(compilation); -} -/** - * Create a {@link ANeuralNetworksExecution} to apply the given compilation. - * This only creates the object. Computation is only performed once - * {@link ANeuralNetworksExecution_startCompute} is invoked. - * - * <p>The provided compilation must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated. - * @param execution The newly created object or NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA - * if the compilation is invalid. - */ -inline int ANeuralNetworksExecution_create( - ANeuralNetworksCompilation* compilation, - ANeuralNetworksExecution** execution) { - LOAD_FUNCTION(ANeuralNetworksExecution_create); - EXECUTE_FUNCTION_RETURN(compilation, execution); -} - -/** - * Destroy an execution. - * - * <p>If called on an execution for which - * {@link ANeuralNetworksExecution_startCompute} has been called, the - * function will return immediately but will mark the execution to be deleted - * once the computation completes. The {link ANeuralNetworksExecution_wait} - * will return ANEURALNETWORKS_ERROR_DELETED. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be destroyed. Passing NULL is acceptable - * and results in no operation. - */ -inline void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) { - LOAD_FUNCTION(ANeuralNetworksExecution_free); - EXECUTE_FUNCTION(execution); -} - -/** - * Associate a user buffer with an input of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided buffer must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This should be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other properties of the type must be the same as - * specified in the model. If the type is the same as specified - * when the model was built, NULL can be passed. - * @param buffer The buffer containing the data. - * @param length The length in bytes of the buffer. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the input. - */ -inline int ANeuralNetworksExecution_setInput( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const void* buffer, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setInput); - EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length); -} - -/** - * Associate part of a memory object with an input of the model of the - * {@link ANeuralNetworksExecution}. 
- * - * <p>The provided memory must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the input. - */ -inline int ANeuralNetworksExecution_setInputFromMemory( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setInputFromMemory); - EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length); -} - -/** - * Associate a user buffer with an output of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided buffer must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param buffer The buffer where the data is to be written. - * @param length The length in bytes of the buffer. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the output. - */ -inline int ANeuralNetworksExecution_setOutput( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, void* buffer, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setOutput); - EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length); -} - -/** - * Associate part of a memory object with an output of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided memory must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. 
- * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param memory The memory where the data is to be stored. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The length in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the output. - */ -inline int ANeuralNetworksExecution_setOutputFromMemory( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setOutputFromMemory); - EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length); -} - -/** - * Schedule evaluation of the execution. - * - * <p>Schedules evaluation of the execution. Once the model has been - * applied and the outputs are ready to be consumed, the execution will be - * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that signal. - * </p> - * - * Multiple executions can be scheduled and evaluated concurrently, and - * compilations can be performed concurrently with executions. The runtime makes - * no guarantee on the ordering of the completion of compilations and - * executions. If it's important to the application, the application should - * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and - * {@link ANeuralNetworksExecution_wait}. - * - * ANeuralNetworksExecution_wait must be called to recuperate the resources used - * by the execution. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be scheduled and executed. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksExecution_startCompute( - ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) { - LOAD_FUNCTION(ANeuralNetworksExecution_startCompute); - EXECUTE_FUNCTION_RETURN(execution, event); -} - -/** - * Waits until the execution completes. - * - * More than one thread can wait on an event. When the execution completes, - * all threads will be released. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally. - */ -inline int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) { - LOAD_FUNCTION(ANeuralNetworksEvent_wait); - EXECUTE_FUNCTION_RETURN(event); -} - -/** - * Destroys the event. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. 
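Completing the picture sketched earlier: once a model has been finished, compilation and execution both follow the same create / configure / finish lifecycle documented in these wrappers, and startCompute is asynchronous, so the returned event must be waited on before the output buffer is read. Another hedged illustration, again without error checking; `model` is assumed to be a finished four-element ADD model like the one sketched above.

    #include "NeuralNetworksShim.h"

    // Compile a finished model and run it once on caller-provided buffers.
    int run_once(ANeuralNetworksModel* model, const float* in0, const float* in1, float* out)
    {
      ANeuralNetworksCompilation* compilation = nullptr;
      ANeuralNetworksCompilation_create(model, &compilation);
      ANeuralNetworksCompilation_setPreference(compilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
      ANeuralNetworksCompilation_finish(compilation);

      ANeuralNetworksExecution* execution = nullptr;
      ANeuralNetworksExecution_create(compilation, &execution);

      // Indices refer to the lists passed to identifyInputsAndOutputs, not to
      // operand indices; passing NULL for the type reuses the model's type.
      ANeuralNetworksExecution_setInput(execution, 0, nullptr, in0, 4 * sizeof(float));
      ANeuralNetworksExecution_setInput(execution, 1, nullptr, in1, 4 * sizeof(float));
      ANeuralNetworksExecution_setOutput(execution, 0, nullptr, out, 4 * sizeof(float));

      // Schedule the evaluation and block until the event is signaled.
      ANeuralNetworksEvent* event = nullptr;
      ANeuralNetworksExecution_startCompute(execution, &event);
      int rc = ANeuralNetworksEvent_wait(event);

      ANeuralNetworksEvent_free(event);
      ANeuralNetworksExecution_free(execution);
      ANeuralNetworksCompilation_free(compilation);
      return rc;
    }
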
- */ -inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) { - LOAD_FUNCTION(ANeuralNetworksEvent_free); - EXECUTE_FUNCTION(event); -} - -#endif // __NEURAL_NETWORKS_SHIM__ diff --git a/runtime/libs/nnapi/v1.2/CMakeLists.txt b/runtime/libs/nnapi/v1.2/CMakeLists.txt deleted file mode 100644 index 21ec3015f..000000000 --- a/runtime/libs/nnapi/v1.2/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(nnfw_lib_nnapi_1_2 INTERFACE) - -target_include_directories(nnfw_lib_nnapi_1_2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(nnfw_lib_nnapi_1_2 INTERFACE nnfw-nnapi-header) diff --git a/runtime/nnapi-header/include/NeuralNetworks.h b/runtime/nnapi-header/include/NeuralNetworks.h index 7400806d8..0c54d7582 100644 --- a/runtime/nnapi-header/include/NeuralNetworks.h +++ b/runtime/nnapi-header/include/NeuralNetworks.h @@ -24,8 +24,8 @@ * @file NeuralNetworks.h */ -#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H -#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H /****************************************************************** * @@ -43,16 +43,14 @@ * - DO NOT CHANGE THE LAYOUT OR SIZE OF STRUCTURES */ -// For compatibility with android, check __ANDROID_API__ is defined -// If __ANDROID_API__ is pre-defined, this header may be used for android -#ifndef __ANDROID_API__ -#define __ANDROID_API__ 29 -#define __ANDROID_API_Q__ 29 +// For compatibility with android, check __ANDROID__ is defined +#ifndef __ANDROID__ +#define __ANDROID_API__ 30 #define __INTRODUCED_IN(api_level) typedef struct AHardwareBuffer AHardwareBuffer; #else #include <android/hardware_buffer.h> -#endif // __ANDROID_API__ +#endif // __ANDROID__ #include <stddef.h> #include <stdint.h> #include <sys/cdefs.h> @@ -62,7 +60,11 @@ __BEGIN_DECLS /** * Operand types. * - * The type of operands that can be added to a model. + * The type of an operand in a model. + * + * Types prefaced with ANEURALNETWORKS_TENSOR_* must be used for tensor data (i.e., tensors + * with at least one dimension). Types not prefaced by ANEURALNETWORKS_TENSOR_* represent + * scalar values and must have no dimensions. * * Although we define many types, most operators accept just a few * types. Most used are {@link ANEURALNETWORKS_TENSOR_FLOAT32}, @@ -94,7 +96,6 @@ typedef enum { * real_value = (integer_value - zeroPoint) * scale. */ ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5, -#if __ANDROID_API__ >= __ANDROID_API_Q__ /** * An 8 bit boolean scalar value. * @@ -160,7 +161,6 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, - /** * A tensor of 16 bit unsigned integers that represent real numbers. * @@ -175,7 +175,6 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT16_ASYMM = 12, - /** * A tensor of 8 bit signed integers that represent real numbers. * @@ -188,14 +187,36 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13, -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ + /** + * A tensor of 8 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert the + * 8 bit integer to the real value and vice versa. These two numbers are: + * - scale: a 32 bit floating point value greater than zero. + * - zeroPoint: a 32 bit integer, in range [-128, 127]. + * + * The formula is: + * real_value = (integer_value - zeroPoint) * scale. 
+ * + * Available since API level 30. + */ + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED = 14, + /** + * A reference to a model. + * + * {@link ANeuralNetworksModel_setOperandValueFromModel} must be used to set + * the value for an Operand of this type. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MODEL = 15, } OperandCode; /** * Operation types. * - * The type of operations that can be added to a model. + * The type of an operation in a model. * * Available since API level 27. */ @@ -231,6 +252,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -238,15 +261,19 @@ typedef enum { * * 0: A tensor. * * 1: A tensor of the same {@link OperandCode}, and compatible dimensions * as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scales and zeroPoint can be different from input0 scale and zeroPoint. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: The sum, a tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 27. @@ -270,18 +297,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -307,8 +336,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. 
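Both asymmetric 8-bit operand types above use the same affine mapping, real_value = (integer_value - zeroPoint) * scale; the new signed variant only changes the storage type and the zeroPoint range. A small self-contained sketch of the conversion in both directions follows; the rounding and clamping policy is ours, not prescribed by the header.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Dequantize: valid for QUANT8_ASYMM (uint8_t) and QUANT8_ASYMM_SIGNED (int8_t).
    template <typename Q> float dequantize(Q q, float scale, int32_t zeroPoint)
    {
      return (static_cast<int32_t>(q) - zeroPoint) * scale;
    }

    // Quantize: round to nearest, then clamp to the representable range of Q.
    template <typename Q> Q quantize(float real, float scale, int32_t zeroPoint)
    {
      int32_t q = static_cast<int32_t>(std::lround(real / scale)) + zeroPoint;
      int32_t lo = std::numeric_limits<Q>::min();
      int32_t hi = std::numeric_limits<Q>::max();
      return static_cast<Q>(std::min(std::max(q, lo), hi));
    }

    int main()
    {
      const float scale = 0.5f;
      // 3.2 round-trips to the nearest representable value (3.0 for scale 0.5)
      // in either encoding.
      std::printf("%f\n", dequantize<uint8_t>(quantize<uint8_t>(3.2f, scale, 128), scale, 128));
      std::printf("%f\n", dequantize<int8_t>(quantize<int8_t>(3.2f, scale, 0), scale, 0));
      return 0;
    }
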
@@ -330,7 +359,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -346,8 +376,9 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the input section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * (full support since API level 29, see the input section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -357,6 +388,9 @@ typedef enum { * Before API level 29, all input tensors of * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} * must have the same scale and zeroPoint as the output tensor. + * Input tensors of + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * are allowed to have different scale and zeroPoint. * Since API level 29, zero-sized tensors are supported. * * n: An {@link ANEURALNETWORKS_INT32} scalar, specifying the * concatenation axis. @@ -373,7 +407,7 @@ typedef enum { ANEURALNETWORKS_CONCATENATION = 2, /** - * Performs an 2-D convolution operation. + * Performs a 2-D convolution operation. * * The CONV_2D op sweeps a 2-D filter that can mix channels together over a * batch of images, applying the filter to each window of each image of the @@ -409,31 +443,46 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the - * filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * filter. 
+ * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. @@ -466,22 +515,25 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the - * filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * filter. + * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same + * type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. 
The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the @@ -509,10 +561,9 @@ typedef enum { * * Outputs: * * 0: The output 4-D tensor, of shape - * [batches, out_height, out_width, depth_out]. Before API level 29, - * for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, - * the following condition must be satisfied: - * output_scale > input_scale * filter_scale + * [batches, out_height, out_width, depth_out]. + * Before API level 29, for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * the following condition must be satisfied: output_scale > input_scale * filter_scale * * Available since API level 27. */ @@ -559,10 +610,23 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * @@ -570,18 +634,20 @@ typedef enum { * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], * specifying the input. * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out], - * specifying the filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 3. + * specifying the filter. + * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 3. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. 
The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. @@ -620,14 +686,15 @@ typedef enum { * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out], * specifying the filter. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the @@ -654,12 +721,11 @@ typedef enum { * cells between each filter element on height dimension. If this input is set, * input 9 (dilation factor for width) must be specified as well. * Available since API level 29. - * * Outputs: * * 0: The output 4-D tensor, of shape - * [batches, out_height, out_width, depth_out]. Before API level 29, - * for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * [batches, out_height, out_width, depth_out]. Before API level 29, for + * output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the following condition must be satisfied: * output_scale > input_scale * filter_scale * @@ -686,11 +752,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], @@ -705,7 +773,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape [batch, height*block_size, * width*block_size, depth/(block_size*block_size)]. 
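One rule recurs across the convolution variants documented above: when the filter uses symmetric per-channel quantization, the INT32 bias must use zeroPoint 0 and bias_scale[i] = input_scale * filter_scale[i] for every output channel i. A trivial helper expressing that relation (the names are ours, purely illustrative):

    #include <cstdio>
    #include <vector>

    // Per-channel bias scales for a QUANT8_SYMM_PER_CHANNEL filter:
    // bias_scale[i] = input_scale * filter_scale[i]; the bias zeroPoint is 0.
    std::vector<float> biasScales(float inputScale, const std::vector<float>& filterScales)
    {
      std::vector<float> scales;
      scales.reserve(filterScales.size());
      for (float fs : filterScales)
        scales.push_back(inputScale * fs);
      return scales;
    }

    int main()
    {
      for (float s : biasScales(0.5f, {0.1f, 0.2f, 0.4f}))
        std::printf("%f\n", s); // 0.05, 0.1, 0.2
      return 0;
    }
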
- * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -723,6 +792,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported output tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) @@ -731,7 +801,8 @@ typedef enum { * Supported tensor rank: up to 4 * * Inputs: - * * 0: A tensor. Since API level 29, this tensor may be zero-sized. + * * 0: A tensor. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: A tensor with the same shape as input0. @@ -761,9 +832,11 @@ typedef enum { * and an error must be reported. * * Supported value tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 30) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_INT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported value tensor rank: from 2 * @@ -777,7 +850,8 @@ typedef enum { * * 0: A n-D tensor with the same rank and shape as the Values * tensor, except for the first dimension which has the same size * as Lookups' only dimension. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input1. * * Available since API level 27. @@ -816,6 +890,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * @@ -826,26 +901,26 @@ typedef enum { * [batch_size, input_size], where "input_size" corresponds to the * number of inputs to the layer, matching the second dimension of * weights, and "batch_size" is calculated by dividing the number of - * elements by "input_size". Since API level 29, zero batch_size is - * supported for this tensor. + * elements by "input_size". + * Since API level 29, zero batch_size is supported for this tensor. * * 1: A 2-D tensor, specifying the weights, of shape * [num_units, input_size], where "num_units" corresponds to the number * of output nodes. * * 2: A 1-D tensor, of shape [num_units], specifying the bias. For input * tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the bias should - * also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. + * also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. 
+ * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. * * Outputs: - * * 0: The output tensor, of shape [batch_size, num_units]. Before API - * level 29, for output tensor of {@link - * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following condition must - * be satisfied: output_scale > input_scale * filter_scale. + * * 0: The output tensor, of shape [batch_size, num_units]. Before API level 29, for + * output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following + * condition must be satisfied: output_scale > input_scale * filter_scale. * * Available since API level 27. */ @@ -911,7 +986,7 @@ typedef enum { ANEURALNETWORKS_HASHTABLE_LOOKUP = 10, /** - * Applies L2 normalization along the depth dimension. + * Applies L2 normalization along the axis dimension. * * The values in the output tensor are computed as: * @@ -919,13 +994,13 @@ typedef enum { * input[batch, row, col, channel] / * sqrt(sum_{c} pow(input[batch, row, col, c], 2)) * - * For input tensor with rank less than 4, independently normalizes each - * 1-D slice along dimension dim. + * By default the axis dimension is the last dimension of the input tensor. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * Tensors with rank less than 4 are only supported since API level 29. @@ -942,6 +1017,12 @@ typedef enum { * * 0: A tensor of the same {@link OperandCode} and same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 128 and the zeroPoint must be 128. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 128 and the zeroPoint must be 0. + * + * NOTE: Before API level 30, if the elements along an axis are all zeros, + * the result is undefined. Since API level 30, if the elements along an axis + * are all zeros, the result is logical zero. * * Available since API level 27. */ @@ -967,13 +1048,14 @@ typedef enum { * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -999,8 +1081,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. 
+ * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. @@ -1095,17 +1177,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 256 and the zeroPoint must be 0. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 256 and the zeroPoint must be -128. * * Available since API level 27. */ @@ -1158,7 +1243,7 @@ typedef enum { * Outputs: * * 0: If the projection type is Sparse: * Output.Dim == { Tensor[0].Dim[0] } - * A tensor of int32 that represents hash signatures, + * A tensor of int32 that represents hash signatures. * * If the projection type is Dense: * Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] } @@ -1248,7 +1333,7 @@ typedef enum { * * The projection bias (\f$b_{proj}\f$) may (but not required to) have a * value if the recurrent projection layer exists, and should otherwise * have no value. - * * (API level >= 29) The four layer normalization weights either all have + * * (API level 29 or later) The four layer normalization weights either all have * values or none of them have values. Additionally, if CIFG is used, * input layer normalization weights tensor is omitted and the other layer * normalization weights either all have values or none of them have @@ -1406,18 +1491,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -1443,8 +1530,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. 
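The fixed output quantization just described (scale of 1.f / 256 with zeroPoint 0 for ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, or zeroPoint -128 for ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED) follows from the affine rule real = scale * (quantized - zeroPoint): both choices map the full 8-bit range onto [0, 255/256], which is what a sigmoid-style output needs. A minimal standalone check, assuming nothing beyond that rule (the helper name dequant is illustrative):

/* Verifies that both fixed quantizations span the same real range [0, ~0.996]. */
#include <stdio.h>

static float dequant(int q, float scale, int zero_point)
{
    return scale * (float)(q - zero_point);   /* affine dequantization */
}

int main(void)
{
    /* ANEURALNETWORKS_TENSOR_QUANT8_ASYMM: uint8 in [0, 255], zeroPoint 0 */
    printf("asymm        : %f .. %f\n", dequant(0, 1.0f / 256, 0),
           dequant(255, 1.0f / 256, 0));
    /* ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED: int8 in [-128, 127], zeroPoint -128 */
    printf("asymm_signed : %f .. %f\n", dequant(-128, 1.0f / 256, -128),
           dequant(127, 1.0f / 256, -128));
    return 0;
}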
@@ -1466,7 +1553,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1496,6 +1584,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -1506,10 +1596,13 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: The product, a tensor of the same {@link OperandCode} as input0. - * For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the following condition must be satisfied: * output_scale > input1_scale * input2_scale. * @@ -1528,16 +1621,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1555,16 +1650,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of the same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1582,16 +1679,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. 
+ * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1608,6 +1707,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * @@ -1624,7 +1724,8 @@ typedef enum { * * Outputs: * * 0: The output tensor, of shape specified by the input shape. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1642,18 +1743,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both resizing by shape and resizing by scale are supported. * * Inputs (resizing by shape): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output * width of the output tensor. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output @@ -1661,6 +1764,17 @@ typedef enum { * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. * Available since API level 29. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Inputs (resizing by scale, since API level 29): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying @@ -1679,10 +1793,24 @@ typedef enum { * {@link ANEURALNETWORKS_FLOAT32} otherwise. * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. 
If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Outputs: * * 0: The output 4-D tensor, of shape * [batches, new_height, new_width, depth]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scale and zeroPoint must be the same as input0. * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint must be the same as input0. * @@ -1762,19 +1890,21 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * Tensors with rank other than 2 or 4 are only supported since API level 29. * * Inputs: - * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped. Since - * API level 29, this tensor may be zero-sized. + * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped. + * Since API level 29, this tensor may be zero-sized. * * 1: A scalar, specifying the positive scaling factor for the exponent, - * beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scalar must be of - * {@link ANEURALNETWORKS_FLOAT32}. If input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, then the scalar must be of {@link - * ANEURALNETWORKS_FLOAT16}. + * beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scalar + * must be of {@link ANEURALNETWORKS_FLOAT32}. + * If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, then the + * scalar must be of {@link ANEURALNETWORKS_FLOAT16}. * * 2: An optional {@link ANEURALNETWORKS_INT32} scalar, default to -1, * specifying the dimension the activation would be performed on. * Negative index is used to specify axis from the end (e.g. -1 for @@ -1785,6 +1915,8 @@ typedef enum { * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 256 and the zeroPoint must be 0. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 256 and the zeroPoint must be -128. * * Available since API level 27. */ @@ -1808,11 +1940,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. 
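The resize-by-shape signature documented a little earlier in this section (inputs 0..5, with the two optional API level 30 booleans for align corners and half pixel centers) can be wired up roughly as below. This is a sketch only, assuming that hunk belongs to ANEURALNETWORKS_RESIZE_BILINEAR; the helper name, the first_new_index bookkeeping, the shapes, and the omitted error handling are illustrative, not part of the API.

#include <android/NeuralNetworks.h>
#include <stdint.h>

/* Adds a resize-by-shape operation; 'input0' is an existing 4-D float tensor
 * operand and 'first_new_index' is the number of operands already in the model. */
static int add_resize_by_shape(ANeuralNetworksModel* model, uint32_t input0,
                               uint32_t first_new_index, uint32_t* out_index)
{
    const uint32_t out_dims[4] = {1, 64, 64, 8};               /* illustrative shape */
    const ANeuralNetworksOperandType i32  = {.type = ANEURALNETWORKS_INT32};
    const ANeuralNetworksOperandType b8   = {.type = ANEURALNETWORKS_BOOL};
    const ANeuralNetworksOperandType outt = {.type = ANEURALNETWORKS_TENSOR_FLOAT32,
                                             .dimensionCount = 4,
                                             .dimensions = out_dims};
    const int32_t out_w = 64, out_h = 64;
    const uint8_t nchw = 0;          /* input 3: false -> NHWC layout                */
    const uint8_t align = 0;         /* input 4: must stay false because input 5 ... */
    const uint8_t half_pixel = 1;    /* input 5: ... (half pixel centers) is true    */

    ANeuralNetworksModel_addOperand(model, &i32);   /* input 1: output width       */
    ANeuralNetworksModel_addOperand(model, &i32);   /* input 2: output height      */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 3: layout             */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 4: align corners      */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 5: half pixel centers */
    ANeuralNetworksModel_addOperand(model, &outt);  /* output tensor               */

    const uint32_t in[6] = {input0, first_new_index, first_new_index + 1,
                            first_new_index + 2, first_new_index + 3,
                            first_new_index + 4};
    const uint32_t out[1] = {first_new_index + 5};

    ANeuralNetworksModel_setOperandValue(model, in[1], &out_w, sizeof(out_w));
    ANeuralNetworksModel_setOperandValue(model, in[2], &out_h, sizeof(out_h));
    ANeuralNetworksModel_setOperandValue(model, in[3], &nchw, sizeof(nchw));
    ANeuralNetworksModel_setOperandValue(model, in[4], &align, sizeof(align));
    ANeuralNetworksModel_setOperandValue(model, in[5], &half_pixel, sizeof(half_pixel));

    *out_index = out[0];
    return ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_RESIZE_BILINEAR,
                                             6, in, 1, out);
}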
* * Inputs: * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], @@ -1827,7 +1961,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape [batches, height/block_size, * width/block_size, depth_in*block_size*block_size]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1924,17 +2059,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 128 and the zeroPoint must be 128. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 128 and the zeroPoint must be 0. * * Available since API level 27. */ @@ -1942,7 +2080,6 @@ typedef enum { // Operations below are available since API level 28. - // TODO: make the description easier to understand. /** * BatchToSpace for N-dimensional tensors. * @@ -1957,11 +2094,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: An n-D tensor, specifying the tensor to be reshaped @@ -1974,7 +2113,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 28. @@ -1988,6 +2128,11 @@ typedef enum { * dimensions. The output is the result of dividing the first input tensor * by the second, optionally modified by an activation function. * + * For inputs of {@link ANEURALNETWORKS_TENSOR_INT32}, performs + * "floor division" ("//" in Python). For example, + * 5 // 2 = 2 + * -5 // 2 = -3 + * * Two dimensions are compatible when: * 1. they are equal, or * 2. one of them is 1 @@ -2008,6 +2153,7 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2018,6 +2164,8 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. 
+ * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. @@ -2038,6 +2186,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2057,23 +2206,27 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, - * the scale and zeroPoint must be same as input0. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scale and zeroPoint must be the same as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 28. */ ANEURALNETWORKS_MEAN = 31, /** - * Pads a tensor with zeros. + * Pads a tensor. * * This operation pads a tensor according to the specified paddings. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the output section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * (full support since API level 29, see the output section) * * Supported tensor rank: up to 4 * @@ -2095,7 +2248,8 @@ typedef enum { * of the padding: * output0.dimension[i] = * padding[i, 0] + input0.dimension[i] + padding[i, 1] - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * NOTE: Before API level 29, the pad value for @@ -2106,7 +2260,6 @@ typedef enum { */ ANEURALNETWORKS_PAD = 32, - // TODO: make the description easier to understand. /** * SpaceToBatch for N-Dimensional tensors. * @@ -2121,13 +2274,15 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the output section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * (full support since API level 29, see the output section) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: An n-D tensor, specifying the input. @@ -2148,7 +2303,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. 
* * NOTE: Before API level 29, the pad value for @@ -2171,6 +2327,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2186,8 +2343,11 @@ typedef enum { * * 0: A tensor of the same {@link OperandCode} as input0. Contains the * same data as input, but has one or more dimensions of size 1 * removed. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. + * If all input dimensions are equal to 1 and are to be squeezed, the + * output shape is [1]. * * Available since API level 28. */ @@ -2206,6 +2366,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2235,8 +2396,11 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0 and rank (n - k), * where k is the number of bits set in shrink_axis_mask. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. + * If shrink_axis_mask is true for all input dimensions, the output + * shape is [1]. * * Available since API level 28. */ @@ -2270,6 +2434,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2280,10 +2446,13 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 28. @@ -2303,6 +2472,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2314,7 +2484,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 28. 
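The elementwise subtraction hunk a little above adds ANEURALNETWORKS_TENSOR_INT32 support with the constraint that the fused activation must be "NONE". A sketch of what that looks like when building a model, assuming that hunk is the ANEURALNETWORKS_SUB documentation; the helper name, the first_new_index bookkeeping, the 1-D shape, and the omitted error handling are illustrative:

#include <android/NeuralNetworks.h>
#include <stdint.h>

/* Adds out = a - b on two existing ANEURALNETWORKS_TENSOR_INT32 operands.
 * 'first_new_index' is the number of operands already added to the model. */
static int add_int32_sub(ANeuralNetworksModel* model, uint32_t a, uint32_t b,
                         uint32_t first_new_index, uint32_t* out_index)
{
    const uint32_t dims[1] = {16};                              /* illustrative shape */
    const ANeuralNetworksOperandType int32_tensor = {
        .type = ANEURALNETWORKS_TENSOR_INT32, .dimensionCount = 1, .dimensions = dims};
    const ANeuralNetworksOperandType int32_scalar = {.type = ANEURALNETWORKS_INT32};
    const int32_t fuse = ANEURALNETWORKS_FUSED_NONE;            /* mandatory for INT32 inputs */

    ANeuralNetworksModel_addOperand(model, &int32_scalar);      /* input 2: activation */
    ANeuralNetworksModel_addOperand(model, &int32_tensor);      /* output tensor       */
    ANeuralNetworksModel_setOperandValue(model, first_new_index, &fuse, sizeof(fuse));

    const uint32_t in[3]  = {a, b, first_new_index};
    const uint32_t out[1] = {first_new_index + 1};
    *out_index = out[0];
    return ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_SUB, 3, in, 1, out);
}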
@@ -2329,6 +2500,7 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: from 1. * @@ -2350,6 +2522,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -2361,6 +2534,7 @@ typedef enum { * * Outputs: * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor. + * If input is 1-dimensional, the output shape is [1]. * * Available since API level 29. */ @@ -2376,6 +2550,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -2387,6 +2562,7 @@ typedef enum { * * Outputs: * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor. + * If input is 1-dimensional, the output shape is [1]. * * Available since API level 29. */ @@ -2419,7 +2595,8 @@ typedef enum { * and height, dw and dh is the log-scale relative correction factor * for the width and height. For input0 of type * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, this tensor should be - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}. Zero num_rois is + * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}. Zero num_rois is * supported for this tensor. * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape * [num_rois], specifying the batch index of each box. Boxes with @@ -2441,7 +2618,54 @@ typedef enum { ANEURALNETWORKS_AXIS_ALIGNED_BBOX_TRANSFORM = 41, /** - * Performs a forward LSTM on the input followed by a backward LSTM. + * A recurrent neural network layer that applies an LSTM cell to a + * sequence of inputs in forward and backward directions. + * + * The op supports cross-linking via an auxiliary input. Regular cell feeds + * one input into the two RNN cells in the following way: + * + * INPUT (INPUT_REVERSED) + * | | + * --------------------- + * | FW_LSTM BW_LSTM | + * --------------------- + * | | + * FW_OUT BW_OUT + * + * An op with cross-linking takes two inputs and feeds them into the RNN + * cells in the following way: + * + * AUX_INPUT (AUX_INPUT_REVERSED) + * | | + * INPUT | (INPUT_R'D.)| + * | | | | + * ----------------------- + * | \ / \ / | + * | FW_LSTM BW_LSTM | + * ----------------------- + * | | + * FW_OUT BW_OUT + * + * The cross-linking mode is enabled iff auxiliary input and auxiliary + * weights are present. While stacking this op on top of itself, this + * allows to connect both forward and backward outputs from previous cell + * to the next cell's input. + * + * Since API level 30 parallel linking mode is supported. The mode is + * enabled if auxiliary input is present but auxiliary weights are omitted. + * In this case, the cell feeds inputs into the RNN in the following way: + * + * INPUT (AUX_INPUT_REVERSED) + * | | + * --------------------- + * | FW_LSTM BW_LSTM | + * --------------------- + * | | + * FW_OUT BW_OUT + * + * While stacking this op on top of itself, this allows to connect both + * forward and backward outputs from previous cell to the next cell's + * corresponding inputs. 
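The two usage modes spelled out above (cross-linking vs. the API level 30 parallel linking) are selected purely by which optional operands are supplied. A small sketch of that rule; the enum and function names are illustrative and not part of the NNAPI surface:

/* Mode selection as described above: cross-linking iff both the auxiliary
 * input and the auxiliary weights are present; parallel linking (API level 30)
 * when the auxiliary input is present but the auxiliary weights are omitted. */
typedef enum { LINKING_REGULAR, LINKING_CROSS, LINKING_PARALLEL } LinkingMode;

static LinkingMode select_linking_mode(int has_aux_input, int has_aux_weights)
{
    if (has_aux_input && has_aux_weights)  return LINKING_CROSS;
    if (has_aux_input && !has_aux_weights) return LINKING_PARALLEL;
    return LINKING_REGULAR;   /* single regular input feeds both cells */
}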
* * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} @@ -2451,7 +2675,6 @@ typedef enum { * * All input and output tensors must be of the same type. * - * * Inputs: * * 0: The input. * A 3-D tensor of shape: @@ -2543,25 +2766,34 @@ typedef enum { * * 38: The backward input cell state. * A 2-D tensor of shape [batch_size, bw_num_units]. * * 39: The auxiliary input. Optional. - * A 3-D tensor of shape [max_time, batch_size, input_size], where “batch_size” - * corresponds to the batching dimension, and “input_size” is the size - * of the input. - * * 40: The forward auxiliary input-to-input weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 41: The forward auxiliary input-to-forget weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 42: The forward auxiliary input-to-cell weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 43: The forward auxiliary input-to-output weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 44: The backward auxiliary input-to-input weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 45: The backward auxiliary input-to-forget weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 46: The backward auxiliary input-to-cell weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 47: The backward auxiliary input-to-output weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. + * A 3-D tensor of shape [max_time, batch_size, aux_input_size], + * where “batch_size” corresponds to the batching dimension, and + * “aux_input_size” is the size of the auxiliary input. Optional. See + * the docs above for the usage modes explanation. + * * 40: The forward auxiliary input-to-input weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 41: The forward auxiliary input-to-forget weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 42: The forward auxiliary input-to-cell weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 43: The forward auxiliary input-to-output weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 44: The backward auxiliary input-to-input weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 45: The backward auxiliary input-to-forget weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 46: The backward auxiliary input-to-cell weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 47: The backward auxiliary input-to-output weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. * * 48: The activation function. * A value indicating the activation function: * <ul> @@ -2576,17 +2808,17 @@ typedef enum { * then clipping is disabled. 
* If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32}, * this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32}, - * otherwise if all the input tensors have the type {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link - * ANEURALNETWORKS_FLOAT16}. + * otherwise if all the input tensors have the type + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be + * of type {@link ANEURALNETWORKS_FLOAT16}. * * 50: The clipping threshold for the output from the * projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. * If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32}, * this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32}, - * otherwise if all the input tensors have the type {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link - * ANEURALNETWORKS_FLOAT16}. + * otherwise if all the input tensors have the type + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be + * of type {@link ANEURALNETWORKS_FLOAT16}. * * 51: merge_outputs * An {@link ANEURALNETWORKS_BOOL} scalar specifying if the outputs * from forward and backward cells should be merged. @@ -2633,8 +2865,36 @@ typedef enum { * A 3-D tensor of shape: * If time-major: [max_time, batch_size, bw_output_size] * If batch-major: [batch_size, max_time, bw_output_size] + * * 2: The forward activation state output. + * A 2-D tensor of shape [batch_size, fw_output_size] containing an + * activation state from the last time step in the sequence. This + * output is optional and can be omitted. If this output is present + * then outputs 3-5 must be present as well. + * Available since API level 30. + * * 3: The forward cell state output. + * A tensor of shape [batch_size, fw_cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. If this output is present + * then outputs 2, 4, 5 must be present as well. + * Available since API level 30. + * * 4: The backward activation state output. + * A 2-D tensor of shape [batch_size, bw_output_size] containing an + * activation state from the last time step in the sequence. This + * output is optional and can be omitted. If this output is present + * then outputs 2, 3, 5 must be present as well. + * Available since API level 30. + * * 5: The backward cell state output. + * A tensor of shape [batch_size, bw_cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. If this output is present + * then outputs 2-4 must be present as well. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_LSTM = 42, @@ -2662,8 +2922,8 @@ typedef enum { * * “activation” is the function passed as the “fused_activation_function” * argument (if not “NONE”). * - * The op also supports an auxiliary input. Regular cell feeds one input - * into the two RNN cells in the following way: + * The op supports cross-linking via an auxiliary input. 
Regular cell feeds + * one input into the two RNN cells in the following way: * * INPUT (INPUT_REVERSED) * | | @@ -2673,8 +2933,8 @@ typedef enum { * | | * FW_OUT BW_OUT * - * An op with an auxiliary input takes two inputs and feeds them into the - * RNN cells in the following way: + * An op with cross-linking takes two inputs and feeds them into the RNN + * cells in the following way: * * AUX_INPUT (AUX_INPUT_REVERSED) * | | @@ -2687,9 +2947,26 @@ typedef enum { * | | * FW_OUT BW_OUT * + * The cross-linking mode is enabled iff auxiliary input and auxiliary + * weights are present. While stacking this op on top of itself, this + * allows to connect both forward and backward outputs from previous cell + * to the next cell's input. + * + * Since API level 30 parallel linking mode is supported. The mode is + * enabled if auxiliary input is present but auxiliary weights are omitted. + * In this case, the cell feeds inputs into the RNN in the following way: + * + * INPUT (AUX_INPUT_REVERSED) + * | | + * --------------------- + * | FW_RNN BW_RNN | + * --------------------- + * | | + * FW_OUT BW_OUT + * * While stacking this op on top of itself, this allows to connect both * forward and backward outputs from previous cell to the next cell's - * inputs. + * corresponding inputs. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} @@ -2722,11 +2999,17 @@ typedef enum { * A 2-D tensor of shape [batchSize, bwNumUnits]. Specifies a hidden * state input for the first time step of the computation. * * 9: auxInput. - * A 3-D tensor. The shape is the same as of the input 0. + * A 3-D tensor. The shape is defined by the input 6 (timeMajor). If + * it is set to true, then the input has a shape [maxTime, batchSize, + * auxInputSize], otherwise the input has a shape [batchSize, maxTime, + * auxInputSize]. Can be omitted. See the docs above for the usage + * modes explanation. * * 10:fwAuxWeights. - * A 2-D tensor of shape [fwNumUnits, inputSize]. + * A 2-D tensor of shape [fwNumUnits, auxInputSize]. Can be omitted. + * See the docs above for the usage modes explanation. * * 11:bwAuxWeights. - * A 2-D tensor of shape [bwNumUnits, inputSize]. + * A 2-D tensor of shape [bwNumUnits, auxInputSize]. Can be omitted. + * See the docs above for the usage modes explanation. * * 12:fusedActivationFunction. * A {@link FuseCode} value indicating the activation function. If * “NONE” is specified then it results in a linear activation. @@ -2752,8 +3035,24 @@ typedef enum { * (timeMajor). If it is set to true, then the shape is set to * [maxTime, batchSize, bwNumUnits], otherwise the shape is set to * [batchSize, maxTime, bwNumUnits]. + * * 2: The forward hidden state output. + * A 2-D tensor of shape [batchSize, fwNumUnits] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then output + * 3 must be present as well. + * Available since API level 30. + * * 3: The backward hidden state output. + * A 2-D tensor of shape [batchSize, bwNumUnits] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then output + * 2 must be present as well. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. 
This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_RNN = 43, @@ -2780,6 +3079,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Inputs: * * 0: A 2-D Tensor of shape [num_rois, num_classes], specifying the score @@ -2791,7 +3091,11 @@ typedef enum { * order of the boxes corresponds with input0. For input0 of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint of 0 and - * scale of 0.125. Zero num_rois is supported for this tensor. + * scale of 0.125. + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, + * with zeroPoint of -128 and scale of 0.125. + * Zero num_rois is supported for this tensor. * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape * [num_rois], specifying the batch index of each box. Boxes with * the same batch index are grouped together. @@ -2818,6 +3122,8 @@ typedef enum { * [num_output_rois], specifying the score of each output box. The boxes * are grouped by batches, but the sequential order in each batch is not * guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * or {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the scale and zero point must be the same as input0. * * 1: A 2-D Tensor of the same {@link OperandCode} as input1, with shape * [num_output_rois, 4], specifying the coordinates of each @@ -2837,7 +3143,7 @@ typedef enum { ANEURALNETWORKS_BOX_WITH_NMS_LIMIT = 44, /** - * Casts a tensor to a new type. + * Casts a tensor to a type. * * This operation ignores the scale and zeroPoint of quanized tensors, * e.g. it treats a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} input @@ -2848,6 +3154,14 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * Since API level 30, casting tensors of the following + * {@link OperandCode} to the same {@link OperandCode} is supported: + * * {@link ANEURALNETWORKS_TENSOR_BOOL8} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} * * Supported tensor rank: from 1 * @@ -2880,6 +3194,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2894,7 +3209,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} and same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -2952,14 +3268,14 @@ typedef enum { * * 11: A scalar, score_threshold. 
Boxes with scores lower than the * threshold are filtered before sending to the NMS algorithm. The * scalar must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of - * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 12: A scalar, specifying the IoU threshold for hard NMS. The scalar - * must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 13: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to include * background class in the list of label map for the output, set * to false to not include the background. When the background @@ -2992,6 +3308,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3041,6 +3358,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3052,7 +3370,8 @@ typedef enum { * Outputs: * * 0: An (n + 1)-D tensor with the same {@link OperandCode} and data as * input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3078,6 +3397,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3092,7 +3412,8 @@ typedef enum { * * Outputs: * * 0: An (n + k - 1)-D tensor with the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3115,6 +3436,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Inputs: * * 0: A 4-D Tensor specifying the score of each anchor at each @@ -3132,11 +3454,13 @@ typedef enum { * dimensions is the channel dimension. * * 2: A 2-D Tensor of shape [num_anchors, 4], specifying the shape of each * predefined anchor, with format [x1, y1, x2, y2]. 
For input0 of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor should be of * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with scale of 0.125. * * 3: A 2-D Tensor of shape [batches, 2], specifying the size of * each image in the batch, with format [image_height, image_width]. - * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this * tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with * scale of 0.125. * * 4: An {@link ANEURALNETWORKS_FLOAT32} scalar, specifying the ratio @@ -3163,7 +3487,8 @@ typedef enum { * [num_output_rois], specifying the score of each output box. * The boxes are grouped by batches, but the sequential order in * each batch is not guaranteed. For type of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scale and zero + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scale and zero * point must be the same as input0. * * 1: A tensor of the same {@link OperandCode} as input3, of shape * [num_output_rois, 4], specifying the coordinates of each output @@ -3188,6 +3513,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3213,6 +3539,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3271,12 +3598,23 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to * * * input.scale * filter.scale). * + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * * * Quantized with symmetric per channel quantization for the filter: * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} for input, and output. * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could @@ -3295,8 +3633,9 @@ typedef enum { * {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. 
For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint * of 0 and bias_scale == input_scale * filter_scale. For filter tensor * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias @@ -3316,7 +3655,7 @@ typedef enum { * * 8: An {@link ANEURALNETWORKS_INT32} scalar, specifying the stride when * walking through input in the ‘height’ dimension. * * 9: An {@link ANEURALNETWORKS_INT32} scalar, specifying the number of - groups. + * groups. * * 10: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. @@ -3330,12 +3669,14 @@ typedef enum { * [depth_out, filter_height, filter_width, depth_group], specifying * the filter, where depth_out must be divisible by num_groups. For * tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} - * the channel dimension (channelDim at - * {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0. + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint * of 0 and bias_scale == input_scale * filter_scale. For filter tensor * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias @@ -3360,7 +3701,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth_out]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -3382,6 +3724,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -3398,13 +3741,18 @@ typedef enum { * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should * be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint * of 0 and scale of 0.125. + * For input0 of type + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor + * should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with + * zeroPoint of -128 and scale of 0.125. * * 2: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify * NCHW data layout for input0. 
Set to false for NHWC. * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0, with shape * [num_boxes, num_keypoints], specifying score of the keypoints. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from input0 scale and zeroPoint. * * 1: A tensor of the same {@link OperandCode} as input1, with shape * [num_boxes, num_keypoints, 2], specifying the location of @@ -3447,19 +3795,19 @@ typedef enum { * * 0: An n-D tensor, specifying the tensor to be normalized. * * 1: A scalar, specifying gamma, the scale applied to the normalized * tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 2: A scalar, specifying beta, the offset applied to the normalized * tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 3: A scalar, specifying epsilon, the small value added to variance to * avoid dividing by zero. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 4: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify * NCHW data layout for input0 and output0. Set to false for NHWC. * @@ -3479,6 +3827,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3505,6 +3854,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3644,6 +3994,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1. * @@ -3656,7 +4007,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. 
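The phrase "the scale and zeroPoint can be different from inputs' scale and zeroPoint" that closes the hunk just above means the result is requantized with the output operand's own parameters rather than inheriting the input's. A minimal sketch of that requantization for the signed 8-bit case, using only the affine (de)quantization rule; the helper name is illustrative:

#include <math.h>
#include <stdint.h>

static int8_t requantize_signed(int8_t q_in, float in_scale, int32_t in_zp,
                                float out_scale, int32_t out_zp)
{
    const float real = in_scale * (float)(q_in - in_zp);   /* dequantize with input params  */
    long q = lroundf(real / out_scale) + out_zp;            /* requantize with output params */
    if (q < -128) q = -128;                                 /* clamp to the int8 range       */
    if (q >  127) q =  127;
    return (int8_t)q;
}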
@@ -3671,6 +4023,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1. * @@ -3683,7 +4036,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -3719,6 +4073,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3744,6 +4099,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -3761,7 +4117,8 @@ typedef enum { * pad value must be of {@link ANEURALNETWORKS_FLOAT16}. * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the * pad value must be of {@link ANEURALNETWORKS_FLOAT32}. - * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the pad value must be of {@link ANEURALNETWORKS_INT32}. The * scale and zeroPoint are assumed to be the same as in input0. * @@ -3773,7 +4130,8 @@ typedef enum { * of the padding: * output0.dimension[i] = * padding[i, 0] + input0.dimension[i] + padding[i, 1] - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3836,6 +4194,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3846,8 +4205,9 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, - * the scale and zeroPoint can be diffent from the input0 scale and zeroPoint. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scales and zeroPoint can be different from input0 scale and zeroPoint. * * Available since API level 29. */ @@ -3856,14 +4216,23 @@ typedef enum { /** * Quantizes the input tensor. 
* - * The formula is: + * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} output tensor is: * * output = max(0, min(255, round(input / scale) + zeroPoint) * - * Supported tensor {@link OperandCode}: + * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} output + * tensor is: + * + * output = max(-128, min(127, round(input / scale) + zeroPoint) + * + * Supported input tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * + * Supported output tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * Supported tensor rank: from 1 * * Inputs: @@ -3871,7 +4240,8 @@ typedef enum { * * Outputs: * * 0: The output tensor of same shape as input0, but with - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}. + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or. + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}. * * Available since API level 29. */ @@ -3995,7 +4365,8 @@ typedef enum { * * 1: A scalar {@link ANEURALNETWORKS_INT32}, specifying the number of * independent samples to draw for each row slice. * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape [2], - * specifying seeds used to initialize the random distribution. + * specifying seeds used to initialize the random distribution. If both + * provided seeds are 0, both will be randomly generated. * Outputs: * * 0: A 2-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape * [batches, samples], containing the drawn samples. @@ -4026,6 +4397,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4053,6 +4426,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4070,6 +4445,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -4082,7 +4458,10 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4101,6 +4480,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -4113,7 +4493,10 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. 
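The two QUANTIZE formulas quoted in the hunk above map directly onto a pair of small helpers. This is only an illustrative sketch of the documented math (round, add zeroPoint, clamp to the range of the output type); the function names are made up for illustration and are not part of the API.

    #include <math.h>
    #include <stdint.h>

    /* QUANT8_ASYMM:        output = max(0,    min(255, round(input / scale) + zeroPoint)) */
    static uint8_t quantize_asymm_u8(float input, float scale, int32_t zeroPoint) {
        long v = lroundf(input / scale) + zeroPoint;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }

    /* QUANT8_ASYMM_SIGNED: output = max(-128, min(127, round(input / scale) + zeroPoint)) */
    static int8_t quantize_asymm_i8(float input, float scale, int32_t zeroPoint) {
        long v = lroundf(input / scale) + zeroPoint;
        if (v < -128) v = -128;
        if (v > 127)  v = 127;
        return (int8_t)v;
    }

Dequantization runs the other way, real_value = (quantized - zeroPoint) * scale, which is why so many of the output descriptions above are free to carry a scale and zeroPoint different from their inputs'.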
* * Available since API level 29. @@ -4142,6 +4525,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4169,6 +4554,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4188,9 +4575,10 @@ typedef enum { * interpolation. * * Supported tensor {@link OperandCode}: - * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4229,7 +4617,8 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. The output * shape is [num_rois, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from the input0 scale and zeroPoint. * * Available since API level 29. @@ -4252,6 +4641,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4262,7 +4652,8 @@ typedef enum { * * 0: A 4-D tensor, specifying the feature map. * * 1: A 2-D Tensor of shape [num_rois, 4], specifying the locations of * the regions of interest, each line with format [x1, y1, x2, y2]. - * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, * with zeroPoint of 0 and scale of 0.125. * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape @@ -4282,7 +4673,8 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. The output * shape is [num_rois, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4319,6 +4711,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4329,7 +4722,8 @@ typedef enum { * true) or input2 (if false). * * 1: An input tensor of the same shape as input0. * * 2: An input tensor of the same shape and type as input1. 
- * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scales and zeroPoint can be different from input1 scale and zeroPoint. * * Outputs: @@ -4337,6 +4731,7 @@ typedef enum { * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * + * Available since API level 29. */ ANEURALNETWORKS_SELECT = 84, @@ -4376,6 +4771,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4388,7 +4784,8 @@ typedef enum { * * Outputs: * * 0: An n-D tensor of the same type as the input containing the slice. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * its scale and zeroPoint has to be same as the input0 scale and zeroPoint. * * Available since API level 29. @@ -4403,6 +4800,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4415,7 +4813,8 @@ typedef enum { * * Outputs: * * 0 ~ (num_splits - 1): Resulting subtensors. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4455,6 +4854,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4465,7 +4865,8 @@ typedef enum { * * Outputs: * * 0: A tiled tensor of the same {@link OperandCode} and rank as `input`. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4483,6 +4884,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4494,7 +4896,8 @@ typedef enum { * Outputs: * * 0: An n-D tensor of the same type as the input, containing the k * largest elements along each last dimensional slice. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * 1: An n-D tensor of type {@link ANEURALNETWORKS_TENSOR_INT32} * containing the indices of values within the last dimension of input. 
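Several of the REDUCE_* hunks above add the note "If all dimensions are reduced and keep_dims is false, the output shape is [1]". A small shape helper makes that rule concrete; this is a sketch only, and the function name is illustrative rather than anything defined by the header.

    #include <stdbool.h>
    #include <stdint.h>

    /* Output rank/shape of a reduction over the axes marked in `reduce_axis`.
     * keep_dims == true : reduced axes are kept with size 1.
     * keep_dims == false: reduced axes are dropped; if nothing is left, the shape is [1]. */
    static uint32_t reduced_shape(const uint32_t* in_dims, uint32_t in_rank,
                                  const bool* reduce_axis, bool keep_dims,
                                  uint32_t* out_dims) {
        uint32_t out_rank = 0;
        for (uint32_t i = 0; i < in_rank; ++i) {
            if (!reduce_axis[i])     out_dims[out_rank++] = in_dims[i];
            else if (keep_dims)      out_dims[out_rank++] = 1;
        }
        if (out_rank == 0) {         /* all dimensions reduced and keep_dims == false */
            out_dims[0] = 1;
            out_rank = 1;
        }
        return out_rank;
    }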
@@ -4531,6 +4934,18 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could @@ -4540,24 +4955,25 @@ typedef enum { * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the * filter. For tensor of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the - * same type. For input tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. For filter tensor of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal - * to bias_scale[i] = input_scale * filter_scale[i]. + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the + * same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to + * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -4578,24 +4994,25 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. 
+ * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the * filter. For tensor of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the - * same type. For input tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. For filter tensor of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal - * to bias_scale[i] = input_scale * filter_scale[i]. + * same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to + * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_TENSOR_INT32} tensor, specifying the output * tensor shape. * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit @@ -4614,7 +5031,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth_out]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -4727,8 +5145,21 @@ typedef enum { * A 3-D tensor of shape: * If time-major: [max_time, batch_size, output_size] * If batch-major: [batch_size, max_time, output_size] + * * 1: A tensor of shape [batch_size, output_size] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then + * output #2 must be present as well. + * Available since API level 30. + * * 2: A tensor of shape [batch_size, cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, @@ -4784,8 +5215,16 @@ typedef enum { * it is set to 1, then the output has a shape [maxTime, batchSize, * numUnits], otherwise the output has a shape [batchSize, maxTime, * numUnits]. 
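The bias requirements restated in the two padding variants above boil down to one rule: for QUANT8_ASYMM / QUANT8_ASYMM_SIGNED filters the INT32 bias uses bias_scale == input_scale * filter_scale with zeroPoint 0, while for QUANT8_SYMM_PER_CHANNEL filters the bias operand is declared with scale 0 and the effective per-channel scale is input_scale * filter_scales[channel]. A hedged sketch of how a model builder might derive those scales (names are illustrative):

    #include <stdint.h>

    /* Per-tensor quantized filter: one bias scale for the whole bias vector. */
    static float conv_bias_scale(float input_scale, float filter_scale) {
        return input_scale * filter_scale;     /* the bias zeroPoint must be 0 */
    }

    /* Per-channel quantized filter: one effective bias scale per output channel,
     * even though the bias operand itself is declared with scale 0. */
    static void conv_bias_scales_per_channel(float input_scale,
                                             const float* filter_scales,
                                             uint32_t depth_out,
                                             float* bias_scales_out) {
        for (uint32_t c = 0; c < depth_out; ++c)
            bias_scales_out[c] = input_scale * filter_scales[c];
    }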
+ * * 1: A tensor of shape [batchSize, numUnits] containing hidden state + * from the last time step in the sequence. This output is optional + * and can be omitted. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_RNN = 93, @@ -4800,6 +5239,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4817,6 +5257,17 @@ typedef enum { * height of the output tensor. * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Inputs (resizing by scale): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying @@ -4835,16 +5286,377 @@ typedef enum { * {@link ANEURALNETWORKS_FLOAT32} otherwise. * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Outputs: * * 0: The output 4-D tensor, of shape * [batches, new_height, new_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. */ ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR = 94, + + // Operations below are available since API level 30. + + /** + * Quantized version of {@link ANEURALNETWORKS_LSTM}. + * + * The input and the output use asymmetric quantized types, while the rest + * use symmetric ones. + * + * Inputs: + * * 0: The input to the LSTM cell. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, inputSize] + * * 1: The input-to-input weights. Optional. 
+ * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 2: The input-to-forget weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 3: The input-to-cell weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 4: The input-to-output weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 5: The recurrent-to-input weights. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 6: The recurrent-to-forget weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 7: The recurrent-to-cell weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 8: The recurrent-to-output weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 9: The cell-to-input weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 10: The cell-to-forget weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 11: The cell-to-output weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 12: The input gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 13: The forget gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 14: The cell bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 15: The output gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 16: The projection weights. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [outputSize, numUnits] + * * 17: The projection bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [outputSize] + * * 18: The output from the previous time step. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * * 19: The cell state from the previous time step. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [batchSize, numUnits] + * * 20: The input layer normalization weights. Used to rescale + * normalized inputs to activation at input gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 21: The forget layer normalization weights. Used to + * rescale normalized inputs to activation at forget gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 22: The cell layer normalization weights. Used to rescale + * normalized inputs to activation at cell gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 23: The output layer normalization weights. Used to + * rescale normalized inputs to activation at output gate. 
Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 24: The cell clip. If provided the cell state is clipped + * by this value prior to the cell output activation. Optional. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 25: The projection clip. If provided and projection is enabled, + * this is used for clipping the projected values. Optional. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 26: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at input gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 27: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at forget gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 28: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at cell gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 29: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at output gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 30: The zero point of the hidden state, i.e. input to + * projection. + * Type: {@link ANEURALNETWORKS_INT32}. + * * 31: The scale of the hidden state, i.e. input to + * projection. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * + * Outputs: + * * 0: The output state (out). + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * * 1: The cell state (out). + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [batchSize, numUnits] + * * 2: The output. This is effectively the same as the current + * "output state (out)" value. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * + * Available since API level 30. + */ + ANEURALNETWORKS_QUANTIZED_LSTM = 95, + + /** + * Executes one of the two referenced models as determined by a boolean + * value. + * + * The inputs and outputs of the two referenced models must agree with the + * signature of this operation. That is, if the operation has (3 + n) inputs + * and m outputs, both models must have n inputs and m outputs with the same + * types, ranks (if specified), dimensions (if specified), scales, + * zeroPoints, and other operand parameters as the corresponding operation + * inputs and outputs. + * + * Inputs: + * * 0: A value of type {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1] + * that determines which of the two referenced models to execute. + * The operand must have fully specified dimensions. + * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the model to be + * executed if the condition is true. + * * 2: A {@link ANEURALNETWORKS_MODEL} reference to the model to be + * executed if the condition is false. + * * 3 ~ (n + 2): Inputs to be passed to the model selected for execution. + * + * Outputs: + * * 0 ~ (m - 1): Outputs produced by the selected model. + * + * Available since API level 30. + */ + ANEURALNETWORKS_IF = 96, + + /** + * Executes the body model until the condition model outputs false. + * + * The inputs to this operation are the condition model, the body model, + * and operand values for the first iteration of the loop. The values are + * implicitly split into three groups of input-output, state-only, and + * input-only values, as described below. + * + * The outputs of this operation are the final values of input-output + * operands. + * + * Both the condition and body model receive (m + k + n) inputs. + * * The first m (m >= 1) inputs are input-output operands. 
For the first + * iteration, these are initialized from the corresponding inputs of the + * WHILE operation. In subsequent iterations, their values come from the + * corresponding outputs of the body model produced during the previous + * iteration. + * * The next k (k >= 0) inputs are state-only operands. They are similar to + * the input-output operands, except that their values are no longer + * available after the loop terminates. + * * The last n (n >= 0) inputs are input-only operands. Their values come + * from the corresponding inputs of the WHILE operation. + * + * The body model produces (m + k) outputs. + * * The first m outputs are input-output operands. They become the outputs + * of the WHILE operation when a termination condition is reached. + * * The last k outputs are state-only operands. Their values are no longer + * available after the loop terminates. + * + * The numbers m, k, and n are inferred by the runtime as follows: + * m = (WHILE operation output count) + * k = (body model output count) - m + * n = (body model input count) - m - k + * + * The pseudo-code below illustrates the flow of a WHILE operation with + * inputs condition, body, initial_input_output, initial_state, input_only + * (m = 1, k = 1, n = 1): + * + * input_output = initial_input_output + * state = initial_state + * while condition(input_output, state, input_only): + * input_output, state = body(input_output, state, input_only) + * return input_output + * + * To prevent infinite loops, there is an implicit execution timeout + * associated with each loop ("loop timeout duration"). See {@link + * ANeuralNetworksExecution_setLoopTimeout}. + * + * Inputs: + * * 0: A {@link ANEURALNETWORKS_MODEL} reference to the condition + * model. The model must have (m + k + n) inputs with + * the same types, ranks (if specified), dimensions (if specified), + * scales, zeroPoints, and other operand parameters as the + * corresponding inputs of the WHILE operation and exactly one output + * of {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1]. + * The output operand must have fully specified dimensions. + * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the body model. + * The model must have (m + k + n) inputs and (m + k) outputs with + * the same types, ranks (if specified), dimensions (if specified), + * scales, zeroPoints, and other operand parameters as the + * corresponding inputs and outputs of the WHILE operation. + * * (m inputs): Initial values for input-output operands. + * * (k inputs): Initial values for state-only operands. + * * (n inputs): Values for input-only operands. + * + * Outputs: + * * 0 ~ (m - 1): Outputs produced by the loop. + * + * Available since API level 30. + */ + ANEURALNETWORKS_WHILE = 97, + + /** + * Computes exponential linear activation on the input tensor element-wise. + * + * The output is calculated using the following formula: + * + * ELU(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + * + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A tensor, specifying the input. May be zero-sized. + * * 1: A scalar, specifying the alpha parameter. + * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, + * the alpha value must be of {@link ANEURALNETWORKS_FLOAT16}. + * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * the alpha value must be of {@link ANEURALNETWORKS_FLOAT32}. 
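The operand grouping for ANEURALNETWORKS_WHILE described above is easiest to see in code. The sketch below mirrors the documented m/k/n inference rules; the struct and function names are purely illustrative and not part of the API.

    #include <stdint.h>

    /* m, k, n as inferred by the runtime from the WHILE outputs and the body model. */
    typedef struct { uint32_t m, k, n; } WhileSplit;

    static WhileSplit while_split(uint32_t while_output_count,
                                  uint32_t body_output_count,
                                  uint32_t body_input_count) {
        WhileSplit s;
        s.m = while_output_count;              /* input-output operands */
        s.k = body_output_count - s.m;         /* state-only operands   */
        s.n = body_input_count - s.m - s.k;    /* input-only operands   */
        return s;
    }

With m = 1, k = 1, n = 1 this matches the pseudo-code above: the body's first m outputs feed back into the next iteration and become the WHILE outputs once the condition model yields false.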
+ * + * Outputs: + * * 0: The output tensor of same shape and type as input0. + * + * Available since API level 30. + */ + ANEURALNETWORKS_ELU = 98, + + /** + * Computes hard-swish activation on the input tensor element-wise. + * + * Hard swish activation is introduced in + * https://arxiv.org/pdf/1905.02244.pdf + * + * The output is calculated using the following formula: + * + * h-swish(x) = x * max(0, min(6, (x + 3))) / 6 + + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A tensor, specifying the input. May be zero-sized. + * + * Outputs: + * * 0: The output tensor of same shape and type as input0. + * Scale and zero point of this tensor may be different from the input + * tensor's parameters. + * + * Available since API level 30. + */ + ANEURALNETWORKS_HARD_SWISH = 99, + + /** + * Creates a tensor filled with a scalar value. + * + * Supported output tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A 1-D tensor, specifying the desired output tensor shape. + * * 1: A scalar, specifying the value to fill the output tensors with. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, + * the scalar must be of {@link ANEURALNETWORKS_FLOAT16}. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * the scalar must be of {@link ANEURALNETWORKS_FLOAT32}. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_INT32}, + * the scalar must be of {@link ANEURALNETWORKS_INT32}. + * + * Outputs: + * * 0: The output tensor. + * + * Available since API level 30. + */ + ANEURALNETWORKS_FILL = 100, + + /** + * Returns the rank of a tensor. + * + * The rank of a tensor is the number of dimensions in it. Also known as + * "order", "degree", "ndims". + * + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_BOOL8} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: The input tensor. + * + * Outputs: + * * 0: A scalar of {@link ANEURALNETWORKS_INT32}, specifying the rank + * of the input tensor. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RANK = 101, } OperationCode; /** @@ -4880,10 +5692,11 @@ typedef enum { * the same; for odd number of padding, padding to the ending is bigger * than the padding to the beginning by 1. * - * total_padding is a function of input, stride and filter size. + * total_padding is a function of input, stride, dilation and filter size. 
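The two element-wise activations introduced above, ELU and HARD_SWISH, are defined entirely by their formulas, so a direct float reference implementation can be written down as a sketch (float path only; quantized HARD_SWISH additionally involves the operand scale and zeroPoint as described in its hunk):

    #include <math.h>

    /* ELU(x)     = max(0, x) + min(0, alpha * (exp(x) - 1)) */
    static float elu(float x, float alpha) {
        return fmaxf(0.0f, x) + fminf(0.0f, alpha * (expf(x) - 1.0f));
    }

    /* h-swish(x) = x * max(0, min(6, x + 3)) / 6 */
    static float hard_swish(float x) {
        float relu6 = x + 3.0f;
        if (relu6 < 0.0f) relu6 = 0.0f;
        if (relu6 > 6.0f) relu6 = 6.0f;
        return x * relu6 / 6.0f;
    }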
* It could be computed as follows: - * out_size = (input + stride - 1) / stride; - * needed_input = (out_size - 1) * stride + filter_size + * out_size = (input + stride - 1) / stride + * effective_filter_size = (filter_size - 1) * dilation + 1 + * needed_input = (out_size - 1) * stride + effective_filter_size * total_padding = max(0, needed_input - input_size) * The computation is the same for the horizontal and vertical directions. */ @@ -5004,6 +5817,47 @@ typedef enum { * Failure caused by a device not being available. */ ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9, + + /** + * Failure because a deadline could not be met for a task, but future + * deadlines may still be met for the same task after a short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT = 10, + + /** + * Failure because a deadline could not be met for a task, and future + * deadlines will likely also not be met for the same task even after a + * short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT = 11, + + /** + * Failure because of a resource limitation within the driver, but future + * calls for the same task may still succeed after a short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RESOURCE_EXHAUSTED_TRANSIENT = 12, + + /** + * Failure because of a resource limitation within the driver, and future + * calls for the same task will likely also fail even after a short + * delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RESOURCE_EXHAUSTED_PERSISTENT = 13, + + /** + * Failure indicating an object is in a dead state. + * + * Available since API level 30. + */ + ANEURALNETWORKS_DEAD_OBJECT = 14, } ResultCode; /** @@ -5024,6 +5878,48 @@ enum { ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; /** + * Different duration measurements. + * + * Durations are measured in nanoseconds. + * + * Available since API level 29. + */ +typedef enum { + // Execution time on hardware (not driver, which runs on host processor). + ANEURALNETWORKS_DURATION_ON_HARDWARE = 0, + // Execution time in driver (including time on hardware). Excludes overhead + // such as that of the runtime itself and the IPC needed for the runtime to + // communicate with the driver. + ANEURALNETWORKS_DURATION_IN_DRIVER = 1, + // Execution time on hardware, after all dependencies have been signaled. + // If no dependencies specified (for example, if the execution was scheduled other + // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the + // reported time will be the same as ANEURALNETWORKS_DURATION_ON_HARDWARE. + // Available since API level 30. + ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE = 2, + // Execution time in driver, after all dependencies have been signaled. Excludes + // overhead such as that of the runtime itself and the IPC needed for the runtime + // to communicate with the driver. + // If no dependencies specified (for example, if the execution was scheduled other + // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the + // reported time will be the same as ANEURALNETWORKS_DURATION_IN_DRIVER. + // Available since API level 30. + ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER = 3, +} DurationCode; + +/** + * Relative execution priority. + * + * Available since API level 30. 
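The corrected ANEURALNETWORKS_PADDING_SAME arithmetic above (now accounting for dilation) can be written out directly for one spatial direction. This is a sketch of the documented computation with hypothetical names, including the stated rule that an odd amount of padding puts the extra element at the end.

    #include <stdint.h>

    /* ANEURALNETWORKS_PADDING_SAME, one spatial direction. */
    static void same_padding(int32_t input_size, int32_t stride, int32_t dilation,
                             int32_t filter_size, int32_t* pad_begin, int32_t* pad_end) {
        int32_t out_size              = (input_size + stride - 1) / stride;
        int32_t effective_filter_size = (filter_size - 1) * dilation + 1;
        int32_t needed_input          = (out_size - 1) * stride + effective_filter_size;
        int32_t total_padding         = needed_input - input_size;
        if (total_padding < 0) total_padding = 0;
        *pad_begin = total_padding / 2;            /* smaller half at the beginning        */
        *pad_end   = total_padding - *pad_begin;   /* odd total: the extra goes to the end */
    }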
+ */ +typedef enum { + ANEURALNETWORKS_PRIORITY_LOW = 90, + ANEURALNETWORKS_PRIORITY_MEDIUM = 100, + ANEURALNETWORKS_PRIORITY_HIGH = 110, + ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM, +} PriorityCode; + +/** * ANeuralNetworksMemory is an opaque type that represents memory. * * This type is used to represent shared memory, memory mapped files, @@ -5049,7 +5945,21 @@ enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; * of the element type byte size, e.g., a tensor with * {@link ANEURALNETWORKS_TENSOR_FLOAT32} type must be aligned on 4-byte boundary. * + * It is the application's responsibility to ensure that there are no uses of + * the memory after calling {@link ANeuralNetworksMemory_free}. This includes + * any model which references this memory because of a call to + * {@link ANeuralNetworksModel_setOperandValueFromMemory}, any compilation + * created using such a model, any execution object or burst object created + * using such a compilation, or any execution which references this memory + * because of a call to {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory}. + * * Available since API level 27. + * + * Starting at API level 30, the application may request creation of device native memory from + * {@link ANeuralNetworksMemoryDesc} to avoid potential memory copying and transformation + * overhead between executions. See also {@link ANeuralNetworksMemoryDesc} and + * {@link ANeuralNetworksMemory_createFromDesc}. */ typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; @@ -5079,9 +5989,10 @@ typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; * modifies a model at a given time. It is however safe for more than one * thread to use the model once {@link ANeuralNetworksModel_finish} has returned.</p> * - * <p>It is also the application's responsibility to ensure that there are no other - * uses of the model after calling {@link ANeuralNetworksModel_free}. - * This includes any compilation or execution object created using the model.</p> + * <p>It is also the application's responsibility to ensure that there are no + * other uses of the model after calling {@link ANeuralNetworksModel_free}. + * This includes any compilation, execution object or burst object created using + * the model.</p> * * Available since API level 27. */ @@ -5119,7 +6030,10 @@ typedef struct ANeuralNetworksModel ANeuralNetworksModel; * * <p>It is also the application's responsibility to ensure that there are no other * uses of the compilation after calling {@link ANeuralNetworksCompilation_free}. - * This includes any execution object created using the compilation.</p> + * This includes any execution object or burst object created using the compilation, + * or any memory descriptor with the compilation as part of one of the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} or + * {@link ANeuralNetworksMemoryDesc_addOutputRole}.</p> * * Available since API level 27. 
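The DurationCode values introduced above are consumed by the duration-query entry point that API level 29 provides elsewhere in this header. The snippet below is only an illustrative sketch of reading one of them after an execution has completed; it assumes the usual ANeuralNetworksExecution_getDuration signature and that timing was enabled beforehand with ANeuralNetworksExecution_setMeasureTiming, and the include path is a placeholder for wherever this header is vendored.

    #include <stdint.h>
    #include <stdio.h>
    #include "NeuralNetworks.h"   /* adjust to the vendored location of this header */

    /* Illustrative only: `execution` must already have completed with timing enabled. */
    static int print_hardware_time(const ANeuralNetworksExecution* execution) {
        uint64_t ns = 0;
        int status = ANeuralNetworksExecution_getDuration(
            execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &ns);
        if (status != ANEURALNETWORKS_NO_ERROR) return status;
        printf("on-hardware time: %llu ns\n", (unsigned long long)ns);
        return ANEURALNETWORKS_NO_ERROR;
    }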
*/ @@ -5139,7 +6053,8 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation; * {@link ANeuralNetworksExecution_setOutput} or * {@link ANeuralNetworksExecution_setOutputFromMemory}.</li> * <li>Apply the model with one of the following:</li><ul> - * <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute}, + * <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute} + * or with {@link ANeuralNetworksExecution_startComputeWithDependencies}, * waiting for the execution to complete with * {@link ANeuralNetworksEvent_wait}.</li> * <li>Synchronously with {@link ANeuralNetworksExecution_compute}.</li> @@ -5154,38 +6069,54 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation; * ({@link ANeuralNetworksModel_setOperandValueFromMemory}).</p> * * <p>An execution cannot be modified once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} has been called on it.</p> + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} has been called on it.</p> * * <p>An execution can be applied to a model with - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} only once. Create new + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} only once. Create new * executions to do new evaluations of the model.</p> * * <p>It is the application's responsibility to make sure that only one thread * modifies an execution at a given time. It is however safe for more than one * thread to use {@link ANeuralNetworksEvent_wait} at the same time.</p> * + * <p>It is also the application's responsibility to ensure that the execution + * either has never been scheduled or has completed (i.e., that + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, or + * {@link ANeuralNetworksEvent_wait} has returned) before calling + * {@link ANeuralNetworksExecution_free}.</p>. + * * <p>It is also the application's responsibility to ensure that there are no other * uses of the execution after calling {@link ANeuralNetworksExecution_free}.</p> * * <p>Multiple executions can be scheduled and evaluated concurrently, either by - * means of {@link ANeuralNetworksExecution_compute} (which is synchronous) in - * different threads or by means of - * {@link ANeuralNetworksExecution_startCompute} (which is asynchronous). The - * runtime makes no guarantee on the ordering of completion of executions. If - * it's important to the application, the application should enforce the - * ordering by ensuring that one execution completes before the next is - * scheduled (for example, by scheduling all executions synchronously within a - * single thread, or by scheduling all executions asynchronously and using - * {@link ANeuralNetworksEvent_wait} between calls to - * {@link ANeuralNetworksExecution_startCompute}).</p> + * means of {@link ANeuralNetworksExecution_compute} or + * {@link ANeuralNetworksExecution_burstCompute} (which are synchronous) in + * different threads, or by means of + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} (which are asynchronous). 
+ * (Concurrent uses of {@link ANeuralNetworksExecution_burstCompute} must be on + * different burst objects.) The runtime makes no guarantee on the ordering of + * completion of executions. If it's important to the application, the + * application should enforce the ordering by ensuring that one execution + * completes before the next is scheduled (for example, by scheduling all + * executions synchronously within a single thread, or by scheduling all + * executions asynchronously and using {@link ANeuralNetworksEvent_wait} between + * calls to {@link ANeuralNetworksExecution_startCompute}); or by using + * {@link ANeuralNetworksExecution_startComputeWithDependencies} to make the execution wait for a + * list of events to be signaled before starting the actual evaluation.</p> * * Available since API level 27. */ typedef struct ANeuralNetworksExecution ANeuralNetworksExecution; -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Parameters for ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. */ @@ -5230,7 +6161,7 @@ typedef struct ANeuralNetworksSymmPerChannelQuantParams { * Available since API level 29. */ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 /** * ANeuralNetworksOperandType describes the type of an operand. @@ -5245,7 +6176,9 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; * * If a tensor operand's type is not fully specified, the dimensions * of the operand are deduced from the operand types and values of the - * operation for which that operand is an output. + * operation for which that operand is an output or from the corresponding + * {@link ANEURALNETWORKS_IF} or {@link ANEURALNETWORKS_WHILE} operation input + * operand type in the case of referenced model input operands. * * <p>In the following situations, a tensor operand type must be fully * specified:<ul> @@ -5254,16 +6187,25 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; * non-nullptr buffer) or * {@link ANeuralNetworksModel_setOperandValueFromMemory}.</li> * <li>The operand is a model input (see - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}). A - * fully specified tensor operand type must either be provided - * to {@link ANeuralNetworksModel_addOperand}; or it must be - * provided to the corresponding + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main + * model within a compilation. A fully specified tensor operand type + * must either be provided to {@link ANeuralNetworksModel_addOperand}; + * or it must be provided to the corresponding * {@link ANeuralNetworksExecution_setInput}, or * {@link ANeuralNetworksExecution_setInputFromMemory}. * EXCEPTION: If the input is optional and omitted * (by passing nullptr for buffer to * {@link ANeuralNetworksExecution_setInput}) then it need - * not have a fully specified tensor operand type.</li></ul> + * not have a fully specified tensor operand type.</li> + * <li>The operand is a model output (see + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main + * model within a compilation and is to be used with {@link + * ANeuralNetworksExecution_startComputeWithDependencies}. 
+ * A fully specified tensor operand type must either be provided + * to {@link ANeuralNetworksModel_addOperand}; or it must be + * provided to the corresponding + * {@link ANeuralNetworksExecution_setOutput}, or + * {@link ANeuralNetworksExecution_setOutputFromMemory}.</li></ul> * * A tensor operand type of specified rank but some number of * unspecified dimensions is represented by setting dimensionCount to @@ -5296,11 +6238,21 @@ typedef struct ANeuralNetworksOperandType { const uint32_t* dimensions; /** - * These two fields are only used for quantized tensors. - * They must be zero for all other types. - * The dequantized value of each entry is (value - zeroPoint) * scale. + * The quantization scale. + * + * Must be 0 when not applicable to an operand type. + * + * See {@link OperandCode}. */ float scale; + + /** + * The quantization zero point. + * + * Must be 0 when not applicable to an operand type. + * + * See {@link OperandCode}. + */ int32_t zeroPoint; } ANeuralNetworksOperandType; @@ -5314,7 +6266,7 @@ typedef int32_t ANeuralNetworksOperationType; */ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent; -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * ANeuralNetworksDevice is an opaque type that represents a device. @@ -5326,6 +6278,318 @@ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent; */ typedef struct ANeuralNetworksDevice ANeuralNetworksDevice; +#endif // __ANDROID_API__ >= 29 + +#if __ANDROID_API__ >= 30 + +/** + * ANeuralNetworksMemoryDesc is an opaque type that represents a memory descriptor. + * + * A memory descriptor describes the properties of a memory object, and is used by + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * To use: + * - Create a new memory descriptor by calling {@link ANeuralNetworksMemoryDesc_create}. + * - Specify all of the intended input and output roles by calling + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. + * - Optionally, specify the memory dimensions by calling + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * - Complete the memory descriptor with {@link ANeuralNetworksMemoryDesc_finish}. + * - Use the memory descriptor as many times as needed with + * {@link ANeuralNetworksMemory_createFromDesc}. + * - Destroy the memory descriptor with {@link ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor is completed by calling {@link ANeuralNetworksMemoryDesc_finish}. + * A memory descriptor is destroyed by calling {@link ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor must not be modified once {@link ANeuralNetworksMemoryDesc_finish} + * has been called on it. + * + * It is the application's responsibility to make sure that only + * one thread modifies a memory descriptor at a given time. It is however + * safe for more than one thread to use the memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has returned. + * + * It is also the application's responsibility to ensure that there are no other + * uses of the memory descriptor after calling {@link ANeuralNetworksMemoryDesc_free}. + * It is however safe to continue using a {@link ANeuralNetworksMemory} object created + * from the memory descriptor. + * + * Available since API level 30. + */ +typedef struct ANeuralNetworksMemoryDesc ANeuralNetworksMemoryDesc; + +/** + * Create a {@link ANeuralNetworksMemoryDesc} with no properties. + * + * This only creates the memory descriptor. 
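The reorganized scale/zeroPoint field documentation above pairs naturally with a concrete operand type. A minimal sketch, assuming the usual ANeuralNetworksOperandType field set (type, dimensionCount, dimensions, scale, zeroPoint) and with example scale/zeroPoint values chosen purely for illustration:

    #include <stdint.h>
    #include "NeuralNetworks.h"   /* adjust to the vendored location of this header */

    /* A 1x224x224x3 QUANT8_ASYMM tensor: real_value = (quantized - zeroPoint) * scale. */
    static const uint32_t kDims[4] = {1, 224, 224, 3};
    static const ANeuralNetworksOperandType kQuantInput = {
        .type           = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM,
        .dimensionCount = 4,
        .dimensions     = kDims,
        .scale          = 0.0078125f,   /* example scale: 1/128 */
        .zeroPoint      = 128,          /* example zero point   */
    };

For non-quantized operand types, both scale and zeroPoint stay 0, as the field comments above require.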
Its properties should be set with calls to + * {@link ANeuralNetworksMemoryDesc_addInputRole}, + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, and + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * + * {@link ANeuralNetworksMemoryDesc_finish} must be called once all properties have been set. + * + * {@link ANeuralNetworksMemoryDesc_free} must be called once the memory descriptor + * is no longer needed. + * + * Available since API level 30. + * + * @param desc The {@link ANeuralNetworksMemoryDesc} to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_create(ANeuralNetworksMemoryDesc** desc) __INTRODUCED_IN(30); + +/** + * Destroy a memory descriptor. + * + * The memory descriptor need not have been finished by a call to + * {@link ANeuralNetworksMemoryDesc_finish}. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be destroyed. Passing NULL is acceptable and + * results in no operation. + */ +void ANeuralNetworksMemoryDesc_free(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30); + +/** + * Specify that a memory object will be playing the role of an input to an execution created from a + * particular compilation. + * + * The compilation and the input index fully specify an input operand. This function + * may be invoked multiple times on the same memory descriptor with different input operands, + * and the same input operand may be specified on multiple memory descriptors. However, + * specifying the same input operand on the same memory descriptor more than once will + * return an error. + * + * The dimensions of the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two + * dimensions are incompatible if both ranks are fully specified but have different values, or if + * there is at least one axis that is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on a memory descriptor + * before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished by calling + * {@link ANeuralNetworksCompilation_finish}, and must outlive the memory + * descriptor. + * @param index The index of the input argument we are referencing from the compilation. It is + * an index into the inputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the + * memory is to be used in the specified role. This is provided as a hint to + * optimize the case when different roles prefer different memory locations or data + * layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. 
+ */ +int ANeuralNetworksMemoryDesc_addInputRole(ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, + uint32_t index, float frequency) __INTRODUCED_IN(30); + +/** + * Specify that a memory object will be playing the role of an output to an execution created from a + * particular compilation. + * + * The compilation and the output index fully specify an output operand. This function + * may be invoked multiple times on the same memory descriptor with different output operands, + * and the same output operand may be specified on multiple memory descriptors. However, + * specifying the same output operand on the same memory descriptor object more than once will + * return an error. + * + * The dimensions of the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two + * dimensions are incompatible if both ranks are fully specified but have different values, or if + * there is at least one axis that is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on the memory descriptor + * before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished by calling + * {@link ANeuralNetworksCompilation_finish}, and must outlive the memory + * descriptor. + * @param index The index of the output argument we are referencing from the compilation. It is + * an index into the outputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the + * memory is to be used in the specified role. This is provided as a hint to + * optimize the case when multiple roles prefer different memory locations or data + * layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_addOutputRole(ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, + uint32_t index, float frequency) __INTRODUCED_IN(30); + +/** + * Set the dimensional information of the memory descriptor. + * + * The specified dimensions must be compatible with the dimensions of the corresponding model + * operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. Two dimensions are incompatible if both ranks + * are fully specified but have different values, or if there is at least one axis that is fully + * specified in both but has different values. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param rank The number of dimensions. 
Must be 0 for scalars. + * @param dimensions An array of dimensions. An entry with the value 0 indicates that the + * corresponding axis has an unknown size. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_setDimensions(ANeuralNetworksMemoryDesc* desc, uint32_t rank, + const uint32_t* dimensions) __INTRODUCED_IN(30); + +/** + * Indicate that we have finished modifying a memory descriptor. Required before calling + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * This function must only be called once for a given memory descriptor. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be finished. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_finish(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30); + +/** + * Creates a memory object from a memory descriptor. + * + * The memory object is created with an uninitialized buffer. A memory object with an uninitialized + * buffer may only be used according to the roles specified by {@link + * ANeuralNetworksMemoryDesc_addOutputRole}, or as the destination memory in {@link + * ANeuralNetworksMemory_copy}. The buffer of a memory object is initialized after the memory object + * is used as an output in a successful execution, or used as the destination memory in a successful + * {@link ANeuralNetworksMemory_copy}. A memory object with an initialized buffer may be used + * according to all roles specified in {@link ANeuralNetworksMemoryDesc}, or as the source or + * destination memory in {@link ANeuralNetworksMemory_copy}. The buffer of a memory object will + * return to the uninitialized state if the memory object is used as an output in a failed + * execution, or used as the destination memory in a failed {@link ANeuralNetworksMemory_copy}. + * + * The dimensions of the memory descriptor are deduced from the dimensions of the corresponding + * model operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, as well as the dimensions set by the call to + * {@link ANeuralNetworksMemoryDesc_setDimensions}, if any. The memory descriptor may have + * unspecified dimensions or rank. In such a case, the same memory object may be used with different + * shapes of outputs in different executions. When the memory is used as an input, the input shape + * must be the same as the output shape from the last execution using this memory object as an + * output, or the last {@link ANeuralNetworkMemory_copy} using this memory object as the destination + * memory. Creating a memory object with unspecified dimensions or rank may fail for certain sets of + * roles. + * + * Using the memory in roles or shapes that are not compatible with the rules specified above will + * return an error. + * + * When calling {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory} with the memory object, + * both offset and length must be set to zero and the entire memory region will be + * associated with the specified input or output operand. + * + * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with the memory created from this + * function will return an error. + * + * {@link ANeuralNetworksMemory_free} must be called once the memory is no longer needed. 
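Before the remaining notes on ANeuralNetworksMemory_createFromDesc, a minimal usage sketch of the memory descriptor calls documented above. It assumes two already finished compilations, here called producer and consumer, whose output 0 and input 0 share a hypothetical {1, 4} float tensor; the function name and the reduced error handling are illustrative only:

#include <android/NeuralNetworks.h>

// Sketch: build a device-preferred memory object that feeds output 0 of one
// compilation into input 0 of another (both assumed to be finished).
ANeuralNetworksMemory* createSharedTensor(const ANeuralNetworksCompilation* producer,
                                          const ANeuralNetworksCompilation* consumer) {
  ANeuralNetworksMemoryDesc* desc = nullptr;
  if (ANeuralNetworksMemoryDesc_create(&desc) != ANEURALNETWORKS_NO_ERROR) return nullptr;

  // Declare the roles the memory will play; frequency 1.0f means "always used".
  ANeuralNetworksMemoryDesc_addOutputRole(desc, producer, /*index=*/0, /*frequency=*/1.0f);
  ANeuralNetworksMemoryDesc_addInputRole(desc, consumer, /*index=*/0, /*frequency=*/1.0f);

  // Optional: pin the shape (hypothetical {1, 4}); a 0 entry would mean "unknown axis".
  const uint32_t dims[] = {1, 4};
  ANeuralNetworksMemoryDesc_setDimensions(desc, 2, dims);

  ANeuralNetworksMemory* memory = nullptr;
  if (ANeuralNetworksMemoryDesc_finish(desc) == ANEURALNETWORKS_NO_ERROR) {
    // May still fail for role sets that cannot work with these dimensions.
    ANeuralNetworksMemory_createFromDesc(desc, &memory);
  }
  // The descriptor does not need to outlive the memory object.
  ANeuralNetworksMemoryDesc_free(desc);
  return memory;  // caller frees with ANeuralNetworksMemory_free
}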
+ * + * Attempting to create memory from an unfinished memory descriptor will return an error. + * + * The provided {@link ANeuralNetworksMemoryDesc} need not outlive the {@link ANeuralNetworksMemory} + * object. + * + * Available since API level 30. + * + * @param desc The memory descriptor. + * @param memory The memory object to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful; ANEURALNETWORKS_OP_FAILED if the memory is + * created with unspecified dimensions or rank and it is not supported for this set of + * roles. + */ +int ANeuralNetworksMemory_createFromDesc(const ANeuralNetworksMemoryDesc* desc, + ANeuralNetworksMemory** memory) __INTRODUCED_IN(30); + +/** + * Copies data from one memory object to another. + * + * If at most one of the src and dst is created from {@link ANeuralNetworksMemory_createFromDesc}, + * the src and dst must have the same logical size: + * - If the memory is created from {@link ANeuralNetworksMemory_createFromFd}, or if it is created + * from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with format of + * AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size of the memory. + * - If the memory is created from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with a + * format other than AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size when there is + * no padding and the data is tightly packed. This function may fail if the AHardwareBuffer + * cannot be accessed. + * - If the memory is created from {@link ANeuralNetworksMemory_createFromDesc}, the logical size + * equals the size indicated by the {@link OperandCode} multiplied by the number of elements. This + * function will fail if the number of elements is unknown. + * + * If both src and dst are created from {@link ANeuralNetworksMemory_createFromDesc}, they must have + * compatible dimensions. Two dimensions are incompatible if both ranks are fully specified but + * have different values, or if there is at least one axis that is fully specified in both but has + * different values. The dst may have unspecified dimensions or rank. In such a case, the dimensions + * of dst will get updated according to the dimensions of the src. + * + * In both cases, if the src is created from {@link ANeuralNetworksMemory_createFromDesc}, it must + * have been used as an output in a successful execution, or used as the destination memory in a + * successful {@link ANeuralNetworksMemory_copy}. + * + * The src and dst may have different data layout, in which case the data copying is performed + * logically with data layout transformation. + * + * Available since API level 30. + * + * @param src The source memory object. + * @param dst The destination memory object. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemory_copy(const ANeuralNetworksMemory* src, const ANeuralNetworksMemory* dst) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + +#if __ANDROID_API__ >= 29 + /** * Get the number of available devices. * @@ -5359,7 +6623,8 @@ int ANeuralNetworks_getDevice(uint32_t devIndex, ANeuralNetworksDevice** device) * @param device The representation of the specified device. * @param name The returned name of the specified device. The name will be in UTF-8 * and will be null-terminated. It will be recognizable as a known device name - * rather than a cryptic string. For devices with feature level 29 and above, the + * rather than a cryptic string. 
For devices with feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is 29 and above, the * format of the name is {VENDOR}-{DEVICE}. For devices with feature level 28 * or lower, the format of the name is undefined. * The name will remain valid for the duration of the application. @@ -5439,6 +6704,26 @@ int ANeuralNetworksDevice_getVersion(const ANeuralNetworksDevice* device, const int ANeuralNetworksDevice_getFeatureLevel(const ANeuralNetworksDevice* device, int64_t* featureLevel) __INTRODUCED_IN(29); +#if __ANDROID_API__ >= 30 + +/** + * Wait until the device is in a live state. + * + * A device may encounter internal errors and temporarily enter a dead state. A + * call that uses a device in such a state will return with the error + * {@link ANEURALNETWORKS_DEAD_OBJECT}. ANeuralNetworksDevice_wait will block until + * the device is in a live state. + * + * @param device The representation of the specified device. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksDevice_wait(const ANeuralNetworksDevice* device) __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Get the supported operations for a specified set of devices. If multiple devices * are selected, the supported operation list is a union of supported operations of all @@ -5473,6 +6758,10 @@ int ANeuralNetworksModel_getSupportedOperationsForDevices( * ANeuralNetworksCompilation_create}, where the runtime will attempt to recover * from such failures. * + * The model passed to this function is termed the "main model" of the + * compilation, to distinguish it from other models referred to by an Operand + * of type {@link ANEURALNETWORKS_MODEL} within this compilation. + * * @param model The {@link ANeuralNetworksModel} to be compiled. * @param devices The set of devices. Must not contain duplicates. * @param numDevices The number of devices in the set. @@ -5502,7 +6791,7 @@ int ANeuralNetworksCompilation_createForDevices(ANeuralNetworksModel* model, * data. It is recommended to use the code cache directory provided * by the Android runtime. If not using the code cache directory, the * user should choose a directory local to the application, and is - * responsible to managing the cache entries. + * responsible for managing the cache entries. * @param token The token provided by the user to specify a model must be of length * ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that * the token is unique to a model within the application. The NNAPI @@ -5525,10 +6814,24 @@ int ANeuralNetworksCompilation_setCaching(ANeuralNetworksCompilation* compilatio * execution has completed and the outputs are ready to be consumed. * </p> * + * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. If the device has + * a feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} + * that is lower than 30, then the timeout duration hint will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. 
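Stepping back to the device and compilation functions earlier in this hunk, a sketch of how they are typically combined: enumerate devices, filter on the feature level that the timeout notes refer to, and compile for a single device. The feature-level threshold of 30, the helper name and the fallback error code are illustrative assumptions:

#include <android/NeuralNetworks.h>

// Sketch: compile a finished model on the first device that reports feature level >= 30.
int compileOnApi30Device(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
  uint32_t count = 0;
  ANeuralNetworks_getDeviceCount(&count);
  for (uint32_t i = 0; i < count; ++i) {
    ANeuralNetworksDevice* device = nullptr;
    ANeuralNetworks_getDevice(i, &device);

    int64_t featureLevel = 0;
    ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel);
    if (featureLevel < 30) continue;

    // Optional: block until the device has recovered from a transient dead state.
    ANeuralNetworksDevice_wait(device);

    const ANeuralNetworksDevice* devices[] = {device};
    return ANeuralNetworksCompilation_createForDevices(model, devices, 1, compilation);
  }
  return ANEURALNETWORKS_BAD_DATA;  // no suitable device found (sketch-level choice)
}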
* - * See {@link ANeuralNetworksExecution_startCompute} for asynchronous execution. - * Synchronous execution incurs lower overhead than asynchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. * * Available since API level 29. * @@ -5544,9 +6847,10 @@ int ANeuralNetworksExecution_compute(ANeuralNetworksExecution* execution) __INTR * Get the dimensional information of the specified output operand of the model of the * {@link ANeuralNetworksExecution}. * - * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute}, - * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate - * the resources used by the execution. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param index The index of the output argument we are querying. It is @@ -5569,9 +6873,10 @@ int ANeuralNetworksExecution_getOutputOperandRank(ANeuralNetworksExecution* exec * Get the dimensional information of the specified output operand of the model of the * {@link ANeuralNetworksExecution}. The target output operand cannot be a scalar. * - * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute}, - * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate - * the resources used by the execution. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param index The index of the output argument we are querying. It is an index into the lists @@ -5625,11 +6930,28 @@ void ANeuralNetworksBurst_free(ANeuralNetworksBurst* burst) __INTRODUCED_IN(29); * <p>Schedules synchronous evaluation of the execution. Returns once the * execution has completed and the outputs are ready to be consumed.</p> * + * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. + * + * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned. If the device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * * <p>There must be at most one {@link ANeuralNetworksExecution} processing at * any given time for any given burst object. Any * {@link ANeuralNetworksExecution} launched before the previous has finished * will result in ANEURALNETWORKS_BAD_STATE.</p> * + * See {@link ANeuralNetworksExecution_compute} for synchronous execution. 
+ * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. + * * Available since API level 29. * * @param burst The burst object to execute on. @@ -5656,14 +6978,14 @@ int ANeuralNetworksExecution_burstCompute(ANeuralNetworksExecution* execution, * offset and length must be set to zero and the entire memory region will be * associated with the specified input or output operand. There is no guarantee * that an arbitrary AHardwareBuffer_Format and AHardwareBuffer_UsageFlags combination - * can be used by arbitrary devices. The execution will fail if selected set of devices - * cannot consume the buffer. + * can be used by arbitrary devices. The execution will fail if the selected set of + * devices cannot consume the buffer. * * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with shared memory * backed by an AHardwareBuffer of a format other than AHARDWAREBUFFER_FORMAT_BLOB is * disallowed. * - * TODO(miaowang): add documentation about intended usage with introspection API. + * The provided AHardwareBuffer must outlive the ANeuralNetworksMemory object. * * Available since API level 29. * @@ -5686,8 +7008,12 @@ int ANeuralNetworksMemory_createFromAHardwareBuffer(const AHardwareBuffer* ahwb, * * By default, duration is not measured. * - * The {@link ANeuralNetworksExecution} must have been created with + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1. + * If the device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 29, then the + * duration will not be measured. * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * @@ -5702,41 +7028,32 @@ int ANeuralNetworksExecution_setMeasureTiming(ANeuralNetworksExecution* executio __INTRODUCED_IN(29); /** - * Different duration measurements. - * - * Durations are measured in nanoseconds. - * - * Available since API level 29. - */ -typedef enum { - // Execution time on hardware (not driver, which runs on host processor). - ANEURALNETWORKS_DURATION_ON_HARDWARE = 0, - // Execution time in driver (including time on hardware). Excludes overhead - // such as that of the runtime itself and the IPC needed for the runtime to - // communicate with the driver. - ANEURALNETWORKS_DURATION_IN_DRIVER = 1, -} DurationCode; - -/** * Get the time spent in the specified {@link ANeuralNetworksExecution}, in nanoseconds. - * The execution must have completed. * - * Available since API level 29. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param durationCode The measurement to be queried, specified by {@link DurationCode}. * @param duration The returned duration. If no measurement was requested by - * {@link ANeuralNetworksExecution_setMeasureTiming}, or for some other - * reason the duration is not available, UINT64_MAX will be returned. - * A particular device need not support any given measurement. 
+ * {@link ANeuralNetworksExecution_setMeasureTiming}, if the + * device is has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower + * than 29, or for some other reason the duration is not + * available, UINT64_MAX will be returned. A particular device + * need not support any given measurement. * * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 29. */ int ANeuralNetworksExecution_getDuration(const ANeuralNetworksExecution* execution, int32_t durationCode, uint64_t* duration) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 #if __ANDROID_API__ >= 27 @@ -5776,7 +7093,8 @@ int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd, size_t * * Available since API level 27. * - * @param memory The memory object to be freed. + * @param memory The memory object to be freed. Passing NULL is acceptable and + * results in no operation. */ void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(27); @@ -5784,8 +7102,10 @@ void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(2 * Create an empty {@link ANeuralNetworksModel}. * * <p>This only creates the object. Computation is performed once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} is invoked. + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked. * * The model should be constructed with calls to * {@link ANeuralNetworksModel_addOperation} and @@ -5826,8 +7146,8 @@ void ANeuralNetworksModel_free(ANeuralNetworksModel* model) __INTRODUCED_IN(27); * calling {@link ANeuralNetworksCompilation_create} and * {@link ANeuralNetworksCompilation_createForDevices}. * - * An application is responsible to make sure that no other thread uses - * the model at the same time. + * An application must ensure that no other thread uses the model at the same + * time. * * This function must only be called once for a given model. * @@ -5901,11 +7221,13 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model, * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES} * are immediately copied into the model. * - * For values of length greater than {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}, - * a pointer to the buffer is stored within the model. The application is responsible - * for not changing the content of this region until all executions using this model - * have completed. As the data may be copied during processing, modifying the data - * after this call yields undefined results. + * For values of length greater than + * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}, a pointer to + * the buffer is stored within the model. The application must not change the + * content of this region until all executions using this model have + * completed. As the data may be copied during processing, modifying the data + * after this call yields undefined results. The provided buffer must outlive + * this model. * * For large tensors, using {@link ANeuralNetworksModel_setOperandValueFromMemory} * is likely to be more efficient. 
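Following the note above about large tensors, a short sketch contrasting a small constant that is copied into the model with a large constant referenced through a memory object. The operand indices, the file descriptor and the byte count are placeholders, and the helper name is hypothetical:

#include <sys/mman.h>
#include <android/NeuralNetworks.h>

// Sketch: small constants are copied into the model, large ones are referenced
// through a memory object (an mmap-able fd here); the memory must outlive the model.
int setConstants(ANeuralNetworksModel* model, int weightsFd, size_t weightsBytes) {
  // Small value: at most ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES bytes, copied now.
  const int32_t axis = 1;
  ANeuralNetworksModel_setOperandValue(model, /*index=*/2, &axis, sizeof(axis));

  // Large tensor: referenced, not copied.
  ANeuralNetworksMemory* weights = nullptr;
  int status = ANeuralNetworksMemory_createFromFd(weightsBytes, PROT_READ, weightsFd,
                                                  /*offset=*/0, &weights);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;
  return ANeuralNetworksModel_setOperandValueFromMemory(model, /*index=*/1, weights,
                                                        /*offset=*/0, weightsBytes);
}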
@@ -5930,7 +7252,7 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model, int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length) __INTRODUCED_IN(27); -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Sets an operand's per channel quantization parameters. @@ -5955,28 +7277,33 @@ int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams( ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 /** * Sets an operand to a value stored in a memory object. * * The content of the memory is not copied. A reference to that memory is stored - * inside the model. The application is responsible for not changing the content - * of the memory region until all executions using this model have completed. - * As the data may be copied during processing, modifying the data after this call - * yields undefined results. + * inside the model. The application must not change the content of the memory + * region until all executions using this model have completed. As the data may + * be copied during processing, modifying the data after this call yields + * undefined results. + * + * <p>The provided memory must outlive this model.</p> * * To indicate that an optional operand should be considered missing, * use {@link ANeuralNetworksModel_setOperandValue} instead, passing nullptr for buffer. * - * Is disallowed to set an operand value with shared memory backed by an AHardwareBuffer + * It is disallowed to set an operand value with shared memory backed by an AHardwareBuffer * of a format other than AHARDWAREBUFFER_FORMAT_BLOB. * + * It is disallowed to set an operand value with memory created from + * {@link ANeuralNetworksMemory_createFromDesc}. + * * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has been * called will return an error. * * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. * * Available since API level 27. @@ -5996,6 +7323,39 @@ int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model, size_t offset, size_t length) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Sets an operand to a value that is a reference to another NNAPI model. + * + * The referenced model must already have been finished by a call to + * {@link ANeuralNetworksModel_finish}. + * + * The {@link ANeuralNetworksModel_relaxComputationFloat32toFloat16} setting of + * referenced models is overridden by that setting of the main model of a + * compilation. + * + * The referenced model must outlive the model referring to it. + * + * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has + * been called will return an error. + * + * See {@link ANeuralNetworksModel} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param value The model to be referenced. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. 
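A sketch of the model-reference operand described above (the declaration follows just below). It assumes bodyModel has already been finished and that the operand added here ends up at the hypothetical index 3; the function name is illustrative:

#include <android/NeuralNetworks.h>

// Sketch: add an operand of type ANEURALNETWORKS_MODEL and point it at another model,
// e.g. the body of a control-flow construct. The referenced model must outlive mainModel.
int referenceSubModel(ANeuralNetworksModel* mainModel, const ANeuralNetworksModel* bodyModel) {
  const ANeuralNetworksOperandType modelOperand = {ANEURALNETWORKS_MODEL, 0, nullptr, 0.0f, 0};
  int status = ANeuralNetworksModel_addOperand(mainModel, &modelOperand);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;
  return ANeuralNetworksModel_setOperandValueFromModel(mainModel, /*index=*/3, bodyModel);
}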
+ */ +int ANeuralNetworksModel_setOperandValueFromModel(ANeuralNetworksModel* model, int32_t index, + const ANeuralNetworksModel* value) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Add an operation to a model. * @@ -6060,6 +7420,9 @@ int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, u * must be calculated using at least the range and precision of the IEEE 754 * 32-bit floating-point format. * + * The relaxComputationFloat32toFloat16 setting of the main model of + * a compilation overrides the values of the referenced models. + * * @param model The model to be modified. * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be * calculated with range and/or precision as low as that of the @@ -6083,7 +7446,11 @@ int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel* /** * Create a {@link ANeuralNetworksCompilation} to compile the given model. * - * <p>This only creates the object. Compilation is only performed once + * The model passed to this function is termed the "main model" of the + * compilation, to distinguish it from other models referred to by an Operand + * of type {@link ANEURALNETWORKS_MODEL} within this compilation. + * + * <p>This function only creates the object. Compilation is only performed once * {@link ANeuralNetworksCompilation_finish} is invoked.</p> * * <p>{@link ANeuralNetworksCompilation_finish} should be called once @@ -6114,7 +7481,7 @@ int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model, * Destroy a compilation. * * The compilation need not have been finished by a call to - * {@link ANeuralNetworksModel_finish}. + * {@link ANeuralNetworksCompilation_finish}. * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * @@ -6128,7 +7495,8 @@ void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) __ /** * Sets the execution preference. * - * <p>Provides guidance to the runtime when trade-offs are possible.</p> + * <p>Provides guidance to the runtime when trade-offs are possible. By default the runtime + * uses PREFER_SINGLE_FAST_ANSWER</p> * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * @@ -6146,13 +7514,19 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila /** * Indicate that we have finished modifying a compilation. Required before - * calling {@link ANeuralNetworksExecution_create}. + * calling {@link ANeuralNetworksBurst_create} or + * {@link ANeuralNetworksExecution_create}. * - * An application is responsible to make sure that no other thread uses - * the compilation at the same time. + * An application must ensure that no other thread uses the compilation at the + * same time. * * This function must only be called once for a given compilation. * + * If {@link ANeuralNetworksCompilation_setTimeout} was called on this + * compilation, and the compilation is not able to be finished before the + * timeout duration is exceeded, then compilation may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. + * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * * Available since API level 27. @@ -6163,11 +7537,85 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila */ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Set the execution priority. 
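Before the details of the priority and timeout hints below, a sketch of how the compilation-side knobs in this area fit together (preference, priority, timeout, finish). It assumes the compilation was created with ANeuralNetworksCompilation_createForDevices for a single device, since the timeout hint is otherwise rejected, and the concrete preference, priority and duration values are arbitrary:

#include <android/NeuralNetworks.h>

// Sketch: configure and finish a compilation created for exactly one device.
int finishTunedCompilation(ANeuralNetworksCompilation* compilation) {
  // Trade-off hint; by default the runtime uses the fast-single-answer preference.
  ANeuralNetworksCompilation_setPreference(compilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);

  // API 30+: relative priority among this application's own executions.
  ANeuralNetworksCompilation_setPriority(compilation, ANEURALNETWORKS_PRIORITY_HIGH);

  // API 30+: hint that compilation should finish within roughly 1 s (nanoseconds).
  ANeuralNetworksCompilation_setTimeout(compilation, 1000000000ull);

  // No further modifications are allowed after this call.
  return ANeuralNetworksCompilation_finish(compilation);
}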
+ * + * Execution priorities are relative to other executions created by the same + * application (specifically same uid) for the same device. Specifically, + * priorities of executions from one application will not affect executions from + * another application. Similarly, priorities of executions on one device will + * not affect executions on another device. + * + * Higher priority executions may use more compute resources than lower priority + * executions, and may preempt or starve lower priority executions. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param compilation The compilation to be modified. + * @param priority The relative priority of the execution compared to other + * executions created by the application. Must be one of + * ANEURALNETWORKS_PRIORITY_*. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksCompilation_setPriority(ANeuralNetworksCompilation* compilation, int priority) + __INTRODUCED_IN(30); + +/** + * Set the maximum expected duration for compiling the model. + * + * If the device is not able to complete the compilation within the specified + * duration, the compilation may be aborted. The timeout duration begins at the + * call to {@link ANeuralNetworksCompilation_finish}. + * + * This timeout duration acts as a hint to drivers, and can be used to both free + * up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long a compilation will take to abort the + * compilation before it has even started if the driver believes the compilation + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing compilation if it is taking too long. However, + * this call does not guarantee that the compilation will complete or abort + * within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksCompilation_setTimeout is called), + * the timeout duration for compiling the model is considered infinite. + * + * The {@link ANeuralNetworksCompilation} must have been created with + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. + * + * @param compilation The compilation to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent finishing a compilation. If this duration is exceeded, the + * compilation may be aborted. If set to 0, the timeout duration is + * considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksCompilation_setTimeout(ANeuralNetworksCompilation* compilation, + uint64_t duration) __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Create a {@link ANeuralNetworksExecution} to apply the given compilation. * This only creates the object. Computation is only performed once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} is invoked. 
+ * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked. * * <p>The provided compilation must outlive the execution.</p> * @@ -6187,12 +7635,16 @@ int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation, /** * Destroy an execution. * - * <p>If called on an execution for which - * {@link ANeuralNetworksExecution_startCompute} has been called, the - * function will return immediately but will mark the execution to be deleted - * once the computation completes. The related {@link ANeuralNetworksEvent} - * will be signaled and the {@link ANeuralNetworksEvent_wait} will return - * ANEURALNETWORKS_ERROR_DELETED. + * <p>The execution need not have been scheduled by a call to + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}; but if it has been scheduled, + * then the application must not call {@link ANeuralNetworksExecution_free} + * until the execution has completed (i.e., + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, or + * {@link ANeuralNetworksEvent_wait} has returned). * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * @@ -6206,7 +7658,10 @@ void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) __INTROD /** * Associate a user buffer with an input of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the buffer until the execution has + * completed. Evaluation of the execution will not change the content of the + * buffer. * * <p>The provided buffer must outlive the execution.</p> * @@ -6244,9 +7699,12 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32 size_t length) __INTRODUCED_IN(27); /** - * Associate part of a memory object with an input of the model of the + * Associate a region of a memory object with an input of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the region until the execution has + * completed. Evaluation of the execution will not change the content of the + * region. * * <p>The provided memory must outlive the execution.</p> * @@ -6255,8 +7713,10 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32 * buffer and 0 for length. * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. + * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects + * created from memory descriptors. * * Available since API level 27. * @@ -6290,7 +7750,9 @@ int ANeuralNetworksExecution_setInputFromMemory(ANeuralNetworksExecution* execut /** * Associate a user buffer with an output of the model of the * {@link ANeuralNetworksExecution}. 
Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the buffer until the execution has + * completed. * * If the output is optional, you can indicate that it is omitted by * passing nullptr for buffer and 0 for length. @@ -6333,9 +7795,11 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3 size_t length) __INTRODUCED_IN(27); /** - * Associate part of a memory object with an output of the model of the + * Associate a region of a memory object with an output of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the region until the execution has + * completed. * * If the output is optional, you can indicate that it is omitted by * using {@link ANeuralNetworksExecution_setOutput} instead, passing nullptr for @@ -6344,8 +7808,10 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3 * <p>The provided memory must outlive the execution.</p> * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. + * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects + * created from memory descriptors. * * Available since API level 27. * @@ -6385,8 +7851,8 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu /** * Schedule asynchronous evaluation of the execution. * - * <p>Schedules asynchronous evaluation of the execution. Once the model has - * been applied and the outputs are ready to be consumed, the returned event + * <p>Schedules asynchronous evaluation of the execution. Once the execution + * has completed and the outputs are ready to be consumed, the returned event * will be signaled. Use {@link ANeuralNetworksEvent_wait} to wait for that * event. * </p> @@ -6394,10 +7860,31 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu * ANeuralNetworksEvent_wait must be called to recuperate the resources used * by the execution. * + * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned through + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksEvent_wait} on the event object. If the device has a + * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that + * is lower than 30, then the timeout duration hint will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned through {@link ANeuralNetworksEvent_wait} on the event + * object. 
+ * + * If the device can detect before the execution has started that the execution + * will not complete within the timeout duration, the device may choose to skip + * the execution and instead return {@link ANEURALNETWORKS_MISSED_DEADLINE_*}. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * See {@link ANeuralNetworksExecution_compute} for synchronous execution. - * Synchronous execution incurs lower overhead than asynchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. * * Available since API level 27. * @@ -6405,21 +7892,129 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu * @param event The event that will be signaled on completion. event is set to * NULL if there's an error. * - * @return ANEURALNETWORKS_NO_ERROR if successful. + * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled. */ int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Set the maximum expected duration of the specified execution. + * + * If the device is not able to complete the execution within the specified + * duration, the execution may be aborted. The timeout duration begins at a + * call to one of: + * - {@link ANeuralNetworksExecution_burstCompute} + * - {@link ANeuralNetworksExecution_compute} + * - {@link ANeuralNetworksExecution_startCompute} + * - {@link ANeuralNetworksExecution_startComputeWithDependencies} + * + * This timeout duration acts as a hint to drivers, and can be used to both free + * up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long an execution will take to abort the + * execution before it has even started if the driver believes the execution + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing execution if it is taking too long. However, this + * call does not guarantee that the execution will complete or abort within the + * timeout duration. + * + * By default (i.e., unless ANeuralNetworksExecution_setTimeout is called), + * the timeout duration for execution is considered infinite. + * + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent executing a model. If this duration is exceeded, the execution + * may be aborted. If set to 0, the timeout duration is considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. 
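A sketch of the asynchronous path combined with the per-execution timeout hint documented above (its declaration follows just below). The operand indices, buffer sizes and the 500 ms value are placeholders, and the compilation is assumed to target a single device so the hint is honoured:

#include <android/NeuralNetworks.h>

// Sketch: one asynchronous inference with a 500 ms execution timeout hint.
int runAsync(ANeuralNetworksCompilation* compilation,
             const float* in, size_t inBytes, float* out, size_t outBytes) {
  ANeuralNetworksExecution* execution = nullptr;
  int status = ANeuralNetworksExecution_create(compilation, &execution);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;

  // nullptr type: reuse the operand types from the model as-is.
  ANeuralNetworksExecution_setInput(execution, 0, nullptr, in, inBytes);
  ANeuralNetworksExecution_setOutput(execution, 0, nullptr, out, outBytes);
  ANeuralNetworksExecution_setTimeout(execution, 500000000ull);  // nanoseconds, API 30+

  ANeuralNetworksEvent* event = nullptr;
  status = ANeuralNetworksExecution_startCompute(execution, &event);
  if (status == ANEURALNETWORKS_NO_ERROR) {
    status = ANeuralNetworksEvent_wait(event);  // also releases execution resources
    ANeuralNetworksEvent_free(event);
  }
  ANeuralNetworksExecution_free(execution);  // only after the event has been waited on
  return status;
}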
+ */ +int ANeuralNetworksExecution_setTimeout(ANeuralNetworksExecution* execution, uint64_t duration) + __INTRODUCED_IN(30); + +/** + * Set the maximum duration of WHILE loops in the specified execution. + * + * This is a fuzzy per-loop timeout intended to prevent infinite loops. + * + * If a WHILE loop condition model does not output false within the specified + * duration, the execution will be aborted. + * + * See {@link ANeuralNetworks_getDefaultLoopTimeout} and + * {@link ANeuralNetworks_getMaximumLoopTimeout} for the default + * and maximum timeout values. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that can be spent + * executing a WHILE loop. If the specified duration value exceeds the value + * produced by {@link ANeuralNetworks_getMaximumLoopTimeout}, it will be + * overridden by that value. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * ANEURALNETWORKS_BAD_STATE if execution has started. + * ANEURALNETWORKS_UNEXPECTED_NULL if execution is NULL. + * + * Available since API level 30. + */ +int ANeuralNetworksExecution_setLoopTimeout(ANeuralNetworksExecution* execution, uint64_t duration) + __INTRODUCED_IN(30); + +/** + * Get the default timeout value for WHILE loops. + * + * @return The default timeout value in nanoseconds. + * + * Available since API level 30. + */ +uint64_t ANeuralNetworks_getDefaultLoopTimeout() __INTRODUCED_IN(30); + +/** + * Get the maximum timeout value for WHILE loops. + * + * @return The maximum timeout value in nanoseconds. + * + * Available since API level 30. + */ +uint64_t ANeuralNetworks_getMaximumLoopTimeout() __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Waits until the execution completes. * * More than one thread can wait on an event. When the execution completes, * all threads will be released. * + * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution + * corresponding to this event, and the execution is not able to complete + * before the duration is exceeded, the execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned here. + * + * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * the execution will be aborted, and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned here. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * Available since API level 27. * + * @param event The event that will be signaled on completion. * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally. * ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory cannot * be properly mapped. @@ -6432,13 +8027,140 @@ int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) __INTRODUCED_IN(27); * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * Available since API level 27. + * + * @param event The event object to be destroyed. Passing NULL is acceptable and + * results in no operation. */ void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) __INTRODUCED_IN(27); #endif // __ANDROID_API__ >= 27 +#if __ANDROID_API__ >= 30 +/** + * Create a {@link ANeuralNetworksEvent} from a sync_fence file descriptor. 
+ * + * The newly created ANeuralNetworksEvent does not take ownership of the provided sync_fence_fd, + * it will instead dup the provided sync_fence_fd and own the duplicate. + * + * @param sync_fence_fd The sync_fence file descriptor. + * @param event The newly created object or NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksEvent_createFromSyncFenceFd(int sync_fence_fd, ANeuralNetworksEvent** event) + __INTRODUCED_IN(30); + +/** + * Get sync_fence file descriptor from the event. + * + * If the ANeuralNetworksEvent is not backed by a sync fence, the sync_fence_fd + * will be set to -1, and ANEURALNETWORKS_BAD_DATA will be returned. + * + * See {@link ANeuralNetworksEvent_createFromSyncFenceFd} and + * {@link ANeuralNetworksExecution_startComputeWithDependencies} to see how to create + * an event backed by a sync fence. + * + * The user takes ownership of the returned fd, and must close the returned file descriptor when + * it is no longer needed. + * + * @param event An event that is backed by a sync fence. + * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will + * be set to -1 if there is an error. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksEvent_getSyncFenceFd(const ANeuralNetworksEvent* event, int* sync_fence_fd) + __INTRODUCED_IN(30); + +/** + * Schedule asynchronous evaluation of the execution with dependencies. + * + * The execution will wait for all the depending events to be signaled before + * starting the evaluation. Once the execution has completed and the outputs + * are ready to be consumed, the returned event will be signaled. Depending on which + * devices are handling the execution, the event could be backed by a sync fence. + * Use {@link ANeuralNetworksEvent_wait} to wait for that event. + * + * ANeuralNetworksEvent_wait must be called to recuperate the resources used + * by the execution. + * + * If parts of the execution are scheduled on devices that do not support fenced execution, + * the function call may wait for such parts to finish before returning. + * + * The function will return an error if any of the events in dependencies is already in a bad + * state. After the execution is scheduled, if any of the events in dependencies does not complete + * normally, the execution will fail, and {@link ANeuralNetworksEvent_wait} on the returned + * event will return an error. + * + * The function will return an error if any of the execution outputs has a tensor operand type + * that is not fully specified. + * + * The function can be passed a timeout duration in nanoseconds. This timeout + * duration acts as a hint to drivers in the same way that the timeout durations + * in {@link ANeuralNetworksCompilation_setTimeout} and {@link + * ANeuralNetworksExecution_setTimeout} act as hints to drivers. The duration + * begins when all waitFor sync fences have been signaled, and can be used + * together with {@link ANeuralNetworksExecution_setTimeout} which specifies the + * maximum timeout duration beginning at the call to + * {@link ANeuralNetworksExecution_startComputeWithDependencies}.
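A sketch of the fenced flow documented in this comment (the declaration follows below): gate the execution on an external sync fence, export the completion fence for another component, then wait. The file descriptors, the zero duration hint and the helper name are placeholders:

#include <android/NeuralNetworks.h>

// Sketch: start an execution once an external sync fence signals, then expose the
// completion fence to another component before waiting on it ourselves.
int startAfterFence(ANeuralNetworksExecution* execution, int inputFenceFd, int* outFenceFd) {
  ANeuralNetworksEvent* dependency = nullptr;
  int status = ANeuralNetworksEvent_createFromSyncFenceFd(inputFenceFd, &dependency);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;

  const ANeuralNetworksEvent* deps[] = {dependency};
  ANeuralNetworksEvent* done = nullptr;
  status = ANeuralNetworksExecution_startComputeWithDependencies(
      execution, deps, 1, /*duration=*/0, &done);
  if (status == ANEURALNETWORKS_NO_ERROR) {
    // Sets -1 and returns ANEURALNETWORKS_BAD_DATA if the event is not fence-backed;
    // otherwise the caller owns the returned fd and must close it when done.
    ANeuralNetworksEvent_getSyncFenceFd(done, outFenceFd);
    status = ANeuralNetworksEvent_wait(done);  // recuperates the execution's resources
    ANeuralNetworksEvent_free(done);
  }
  ANeuralNetworksEvent_free(dependency);
  return status;
}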
+ * If the duration is non-zero, the {@link ANeuralNetworksExecution} must have been created + * from an {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If either + * the timeout duration from {@link ANeuralNetworksExecution_setTimeout} or the + * timeout duration passed to this call is exceeded, the execution may be + * aborted, in which case {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be + * returned through {@link ANeuralNetworksExecution_startComputeWithDependencies} + * or {@link ANeuralNetworksEvent_wait} on the event object. If the device has a + * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that + * is lower than 30, then the timeout duration hints will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned through {@link ANeuralNetworksEvent_wait} on the event + * object. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * See {@link ANeuralNetworksExecution_compute} for synchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * + * @param execution The execution to be scheduled and executed. + * @param dependencies A set of depending events. The actual evaluation will not start + * until all the events are signaled. + * @param num_dependencies The number of events in the dependencies set. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent executing the model after all dependencies are + * signaled. If set to 0, the timeout duration is considered + * infinite. + * @param event The event that will be signaled on completion. event is set to + * NULL if there's an error. + * + * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled. + * + * Available since API level 30. + */ +int ANeuralNetworksExecution_startComputeWithDependencies( + ANeuralNetworksExecution* execution, const ANeuralNetworksEvent* const* dependencies, + uint32_t num_dependencies, uint64_t duration, ANeuralNetworksEvent** event) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + __END_DECLS -#endif // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H + +// For compatibility with android, check __ANDROID__ is defined +#ifndef __ANDROID__ +#undef __ANDROID_API__ +#undef __INTRODUCED_IN +#endif // __ANDROID__ /** @} */ diff --git a/runtime/nnapi-header/include/NeuralNetworksExtensions.h b/runtime/nnapi-header/include/NeuralNetworksExtensions.h index ca2e04567..dd51b0301 100644 --- a/runtime/nnapi-header/include/NeuralNetworksExtensions.h +++ b/runtime/nnapi-header/include/NeuralNetworksExtensions.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H -#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H #include "NeuralNetworks.h" @@ -37,7 +37,7 @@ __BEGIN_DECLS -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Queries whether an extension is supported by the driver implementation of the specified device. @@ -110,8 +110,8 @@ int ANeuralNetworksModel_setOperandExtensionData(ANeuralNetworksModel* model, in const void* data, size_t length) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 __END_DECLS -#endif // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H diff --git a/runtime/onert/api/CMakeLists.txt b/runtime/onert/api/CMakeLists.txt index 49a5aa071..9c6dd90cc 100644 --- a/runtime/onert/api/CMakeLists.txt +++ b/runtime/onert/api/CMakeLists.txt @@ -9,10 +9,16 @@ add_library(${ONERT_DEV} SHARED ${API_SRC}) set(NNFW_API_HEADERS include/nnfw.h include/nnfw_experimental.h) target_link_libraries(${ONERT_DEV} PUBLIC nnfw-nnapi-header) -target_link_libraries(${ONERT_DEV} PUBLIC onert_core) +target_link_libraries(${ONERT_DEV} PRIVATE onert_core) target_link_libraries(${ONERT_DEV} PRIVATE jsoncpp tflite_loader circle_loader ${LIB_PTHREAD}) target_link_libraries(${ONERT_DEV} PRIVATE nnfw_common) target_link_libraries(${ONERT_DEV} PRIVATE nnfw_coverage) +# NOTE Below line is added to remove warning for android build +# It will be removed after android build uses gold linker +if (ANDROID) + target_link_libraries(${ONERT_DEV} INTERFACE log) +endif (ANDROID) + target_include_directories(${ONERT_DEV} PUBLIC include) set_target_properties(${ONERT_DEV} PROPERTIES PUBLIC_HEADER "${NNFW_API_HEADERS}") diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h index 42e43760b..8c6ea3994 100644 --- a/runtime/onert/api/include/nnfw_version.h +++ b/runtime/onert/api/include/nnfw_version.h @@ -21,6 +21,6 @@ * NNFW_VERSION is a uint32 value representing nnfw runtime version * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch */ -#define NNFW_VERSION 0x01000900 +#define NNFW_VERSION 0x01000a00 #endif // __NNFW_VERSION_H__ diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index 81b40703f..aa066e190 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -112,7 +112,16 @@ NNFW_STATUS nnfw_session::load_circle_from_buffer(uint8_t *buffer, size_t size) if (size == 0) return NNFW_STATUS_ERROR; - _subgraphs = onert::circle_loader::loadModel(buffer, size); + try + { + _subgraphs = onert::circle_loader::loadModel(buffer, size); + } + catch (const std::exception &e) + { + std::cerr << "Error during model loading : " << e.what() << std::endl; + return NNFW_STATUS_ERROR; + } + _compiler = std::make_unique<onert::compiler::Compiler>(_subgraphs); _state = State::MODEL_LOADED; diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc index 31f1c10eb..b45b91058 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc @@ -14,6 +14,11 @@ * limitations under the License. 
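The nnfw_version.h hunk above bumps NNFW_VERSION to 0x01000a00; a small sketch of how the 0xMMmmmmPP packing decodes back to "1.10.0", with the macro value inlined rather than pulled from the header:

#include <cstdint>
#include <cstdio>

// Sketch: decode the 0xMMmmmmPP packing used by NNFW_VERSION (value inlined below).
int main() {
  const uint32_t version = 0x01000a00;             // NNFW_VERSION after this patch
  const uint32_t major = (version >> 24) & 0xff;   // MM   -> 1
  const uint32_t minor = (version >> 8) & 0xffff;  // mmmm -> 10
  const uint32_t patch = version & 0xff;           // PP   -> 0
  std::printf("%u.%u.%u\n", major, minor, patch);  // prints "1.10.0"
  return 0;
}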
*/ +#include <AclActivationBuilder.h> +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + #include "ConstantInitializer.h" namespace onert @@ -96,6 +101,46 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) } } +void ConstantInitializer::visit(const ir::operation::Reverse &node) +{ + const auto &output_index = node.getOutputs().at(0); + + const auto &input_index = node.getInputs().at(ir::operation::Reverse::Input::INPUT); + const auto &input_obj = _operands.at(input_index); + + const auto &axis_index = node.getInputs().at(ir::operation::Reverse::Input::AXIS); + const auto &axis_obj = _operands.at(axis_index); + + const auto ifm_rank = input_obj.shape().rank(); + const auto frontend_layout = this->_current_op_seq_layout; + + auto output_tensor = this->_tensor_reg->getITensor(output_index); + const auto backend_layout = output_tensor->layout(); + + if (axis_obj.isConstant()) + { + _init_map[axis_index] = [ifm_rank, frontend_layout, backend_layout](const ir::Operand &operand, + backend::ITensor &obj) { + assert(operand.data()); + + const auto axis_value = *(reinterpret_cast<const int32_t *>(operand.data()->base())); + int32_t axis_tmp = axis_value; + if (axis_tmp < 0) + { + axis_tmp = axis_tmp + ifm_rank; + } + + auto axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout, backend_layout).value(); + + obj.access([&](ITensor &tensor) { + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer()); + *into = (int32_t)axis; + }); + }; + } +} + } // namespace acl_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.h b/runtime/onert/backend/acl_cl/ConstantInitializer.h index 4f894fd31..9f3acb461 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.h +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.h @@ -38,6 +38,7 @@ public: void visit(const ir::operation::Gather &) final; void visit(const ir::operation::HashtableLookup &) final; void visit(const ir::operation::SpaceToBatchND &) final; + void visit(const ir::operation::Reverse &) final; }; } // namespace acl_cl diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index 94489253d..cc9afcaeb 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -78,9 +78,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); assert(_ctx.at(block_size_index).data()); @@ -98,9 +98,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); const auto act_info = 
@@ -164,10 +164,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
                                             ker_width, ker_height);
   const auto activation = node.param().activation;
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -202,10 +202,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
   const auto multiplier = node.param().multiplier;
   const auto activation = node.param().activation;
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -240,7 +240,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
     return;
   }
-  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
   std::vector<::arm_compute::ICLTensor *> input_tensors;
   for (auto &ifm_ind : input_indexes)
     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
@@ -268,7 +268,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
   const auto activation = node.param().activation;
   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
@@ -286,8 +286,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
   const auto keep_dims{node.param().keep_dims};
   const auto reduce_type = node.param().reduce_type;
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   // Convert to ACL axes taking into account negative values and possible duplicates.
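The comment above refers to the same negative-axis handling that ArgMax, Split, OneHot, Pack and Unpack use later in this file: a negative axis is first wrapped into [0, rank) and then converted to an ARM Compute axis via acl_common::ToARMComputeAxis with the frontend and backend layouts. A minimal sketch of just the wrapping step; normalizeAxis is a hypothetical name, not a function in this codebase:

#include <cassert>

// Hypothetical helper: wrap a possibly negative frontend axis into [0, rank)
// before it is converted to an ARM Compute axis.
inline int normalizeAxis(int axis, int rank)
{
  assert(rank > 0);
  if (axis < 0)
    axis += rank; // e.g. axis == -1 on a rank-4 tensor becomes 3
  assert(0 <= axis && axis < rank);
  return axis;
}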
const auto &axes = _ctx.at(axes_index); @@ -320,8 +320,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. @@ -351,8 +351,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(), output_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); @@ -365,8 +365,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(), @@ -382,8 +382,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -449,8 +449,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -523,10 +523,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) strides_set.set(i, strides[i]); } + // Disable applied dim_correction + if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(inputData_tensor); + } + auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>( inputData_tensor->handle(), 
outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); + // Revert disabling applied dim_correction + if (inputData_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(inputData_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } @@ -534,22 +547,47 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; + const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; const auto rank = _ctx.at(ifm_idx).shape().rank(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); - // Reversed - auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector( - rank, pv, frontend_layout, backend_layout); + const auto &perms = _ctx.at(perm_idx); + std::vector<int32_t> pv; + if (perms.shape() == ir::Shape{0}) + { + pv.resize(rank); + std::iota(pv.begin(), pv.end(), 0); + std::reverse(pv.begin(), pv.end()); + } + else + { + pv = _ctx.at(perm_idx).asVector<int32_t>(); + } - auto fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(), - ofm_tensor->handle(), backend_pv); + std::unique_ptr<arm_compute::IFunction> fn; + if (rank == 1) + { + fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle()); + } + else if (rank == 2) + { + assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0); + fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(), + ofm_tensor->handle()); + } + else + { + auto backend_pv = + acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout); + + fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(), + ofm_tensor->handle(), backend_pv); + } _return_fn = asAclFunction(std::move(fn)); } @@ -559,8 +597,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo( node.param().op_type, node.param().alpha, node.param().beta); @@ -577,9 +615,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = 
_tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -626,8 +664,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -647,7 +685,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) { fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(), output_tensor->handle()); - ; + } + else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8) + { + fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(), + output_tensor->handle()); } else { @@ -719,8 +761,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(), output_tensor->handle()); @@ -735,10 +777,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get(); - auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index); + auto beta_tensor = _tensor_reg->getAclTensor(beta_index); auto epsilon = node.param().epsilon; auto activation = node.param().activation; @@ -764,9 +806,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get(); - auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input0_tensor = _tensor_reg->getAclTensor(input0_index); + auto input1_tensor = _tensor_reg->getAclTensor(input1_index); auto fn = acl_common::generateLayer<arm_compute::CLComparison>( input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), @@ -775,6 +817,56 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) _return_fn = asAclFunction(std::move(fn)); } +void KernelGenerator::visit(const ir::operation::OneHot &node) +{ + const auto output_idx{node.getOutputs().at(0)}; + const auto 
indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)}; + const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)}; + const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)}; + const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)}; + const auto depth = _ctx.at(depth_idx).asScalar<int32_t>(); + assert(depth > 0); + + auto output_tensor = _tensor_reg->getAclTensor(output_idx); + auto indices_tensor = _tensor_reg->getAclTensor(indices_idx); + auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx); + + const size_t output_rank = _ctx.at(output_idx).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = output_tensor->layout(); + int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); + + if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and output_tensor is applied dim_correction + acl_common::disableDimCorrection(output_tensor); + } + + std::unique_ptr<::arm_compute::IFunction> fn; + const auto &offvalue = _ctx.at(offvalue_idx); + if (offvalue.isConstant()) + { + fn = acl_common::generateLayer<arm_compute::CLOneHot>( + indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(), + acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis); + } + else + { + auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx); + fn = acl_common::generateLayer<arm_compute::CLOneHot>( + indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(), + output_tensor->handle(), static_cast<uint32_t>(depth), axis); + } + + if (output_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(output_tensor); + } + + _return_fn = asAclFunction(std::move(fn)); +} + void KernelGenerator::visit(const ir::operation::Pack &node) { const auto output_index{node.getOutputs().at(0)}; @@ -786,41 +878,39 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : node.getInputs()) input_indexes.emplace_back(input_index); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); std::vector<arm_compute::ICLTensor *> inputs; for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) axis += output_rank; axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes; for (const auto &input_index : input_indexes) { - size_t input_rank = _ctx.at(input_index).shape().rank(); const auto &input_tensor = _tensor_reg->getAclTensor(input_index); - orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); - assert(input_rank == input_tensor->num_dimensions()); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - // This means that high dimension's value is 1 and ifm tensor is 
applied dim_correction - input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } } auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output); // Revert disabling applied dim_correction - assert(inputs.size() == orig_inputs_acl_tensor_shapes.size()); - for (size_t i = 0; i < inputs.size(); ++i) + for (const auto &input_index : input_indexes) { - inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i)); + const auto &input_tensor = _tensor_reg->getAclTensor(input_index); + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } } _return_fn = asAclFunction(std::move(fn)); @@ -833,7 +923,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); const auto activation = node.param().activation; _return_fn = std::make_unique<exec::FunctionSequence>( asAclFunction(std::move(raw_fn)), @@ -845,8 +935,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -879,11 +969,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node) void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLScale>( ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, @@ -896,11 +985,10 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLScale>( ifm_tensor->handle(), ofm_tensor->handle(), @@ -925,14 +1013,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hidden_state_out_tensor = 
_tensor_reg->getAclTensor(hidden_state_out_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get(); - auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); - auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); + auto weights_tensor = _tensor_reg->getAclTensor(weights_index); + auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); + auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>( @@ -954,10 +1042,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); - auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); + auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -976,8 +1064,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>( ifm_tensor->handle(), ofm_tensor->handle(), block_size); @@ -991,9 +1079,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>( values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); @@ -1020,8 +1108,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
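As the comments in this L2Normalization hunk note, beta = 0.5 turns the reduction into 1/sqrt(reduction) and bias = 0 leaves it unoffset, so the cross-map normalization layer effectively divides each element by the square root of the sum of squares. A rough standalone sketch of that arithmetic, illustrative only; it ignores the layer's radius/alpha parameters and any zero-norm guard:

#include <cmath>
#include <cstddef>
#include <vector>

// Divide every element by sqrt(sum of squares), i.e. an L2 normalization.
std::vector<float> l2_normalize(const std::vector<float> &x)
{
  float sum_sq = 0.0f;
  for (float v : x)
    sum_sq += v * v;
  const float inv_norm = 1.0f / std::sqrt(sum_sq); // pow(reduction, -0.5)
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    y[i] = x[i] * inv_norm;
  return y;
}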
- auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); @@ -1041,12 +1129,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hits_tensor = _tensor_reg->getAclTensor(hits_index); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto keys_tensor = _tensor_reg->getAclTensor(keys_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>( lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), @@ -1061,9 +1149,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index); auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>( ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); @@ -1096,9 +1184,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); @@ -1116,9 +1204,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>( 
lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); @@ -1140,9 +1228,9 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) const auto k = node.param().k; - auto values_tensor = _tensor_reg->getAclTensor(outputValues_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(inputData_index).get(); + auto values_tensor = _tensor_reg->getAclTensor(outputValues_index); + auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index); + auto input_tensor = _tensor_reg->getAclTensor(inputData_index); auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>( input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); @@ -1162,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto indices_tensor = _tensor_reg->getAclTensor(indices_index); // NOTE The frontend layout and backend layout must be the same for this operation. // If not the same, we have to add a stage(?) to perform permutation of output tensor. It @@ -1187,29 +1275,29 @@ void KernelGenerator::visit(const ir::operation::Gather &node) assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - const auto ifm = _ctx.at(ifm_index); - ifm_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(ifm_tensor); } - const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction - const auto indices = _ctx.at(indices_index); - indices_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(indices_tensor); } auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>( ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // Revert disabling applied dim_correction - ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); - indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + if (indices_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(indices_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -1218,19 +1306,20 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; auto ifm_shape = _ctx.at(ifm_index).shape(); auto ofm_shape 
= _ctx.at(ofm_index).shape(); assert((ifm_shape.rank() - 1) == ofm_shape.rank()); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto frontend_layout = _current_op_seq_layout; auto backend_layout = ifm_tensor->layout(); - int axis_value = node.param().axis; + int axis_value = _ctx.at(axis_index).asScalar<int32_t>(); if (axis_value < 0) { axis_value += ifm_rank; @@ -1239,7 +1328,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>( + auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>( ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), ::arm_compute::ReductionOperation::ARG_IDX_MAX); @@ -1257,8 +1346,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); @@ -1277,8 +1366,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>( input_tensor->handle(), output_tensor->handle(), block_size); @@ -1289,22 +1378,27 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) void KernelGenerator::visit(const ir::operation::Split &node) { const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + if (!_ctx.at(axis_index).isConstant()) + { + throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend"); + } const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); std::vector<ir::OperandIndex> output_indexes; for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); std::vector<arm_compute::ICLTensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - auto axis = node.param().axis; + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); if (axis < 0) axis += ifm_rank; axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, 
backend_layout).value(); @@ -1315,6 +1409,60 @@ void KernelGenerator::visit(const ir::operation::Split &node) _return_fn = asAclFunction(std::move(fn)); } +void KernelGenerator::visit(const ir::operation::SplitV &node) +{ + const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)}; + const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)}; + const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)}; + + assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + + const size_t ifm_rank = _ctx.at(ifm_index).shape().rank(); + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index); + + std::vector<arm_compute::ICLTensor *> output_tensors; + for (const auto &ofm_ind : output_indexes) + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); + + auto fn = std::make_unique<arm_compute::CLSplitVEx>(); + const auto &split_dim_op = _ctx.at(split_dim_index); + if (split_dim_op.isConstant()) + { + int32_t split_dim = split_dim_op.asScalar<int32_t>(); + uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim; + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = ifm_tensor->layout(); + + if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + acl_common::disableDimCorrection(ifm_tensor); + } + + split_dim_revised = + acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout) + .value(); + fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised, + output_tensors, node.param().num_splits); + + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + } + else + { + throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend"); + } + + _return_fn = asAclFunction(std::move(fn)); +} + void KernelGenerator::visit(const ir::operation::Unpack &node) { const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; @@ -1326,34 +1474,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : node.getOutputs()) output_indexes.emplace_back(output_index); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::vector<arm_compute::ICLTensor *> outputs; for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes; - for (const auto &output_index : output_indexes) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_tensor = _tensor_reg->getAclTensor(output_index); 
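The Unpack change here follows the same recipe as StridedSlice, OneHot, Pack, Gather and SplitV earlier in this file: disable dim_correction when the tensor's own rank no longer matches its ACL TensorInfo rank, configure the layer, then re-enable it only when the leading dimension is 1 (Pad, just below, deliberately keeps it disabled for 4D tensors). A hedged sketch of that wrapper pattern using the enableDimCorrection/disableDimCorrection helpers this change adds to acl_common; withDimCorrectionDisabled itself is a hypothetical name:

// Sketch only: mirrors the disable/configure/re-enable sequence used by the
// kernels above; configure is whatever layer setup the visitor performs.
template <typename AclTensor, typename ConfigureFn>
void withDimCorrectionDisabled(AclTensor *tensor, ConfigureFn &&configure)
{
  // A rank mismatch means the highest dimension is 1 and dim_correction was
  // applied, which the kernel configured below must not rely on.
  if (tensor->num_dimensions() != tensor->info()->num_dimensions())
    acl_common::disableDimCorrection(tensor);

  configure();

  // Same re-enable condition as the hunks in this file.
  if (tensor->dimension(0) == 1)
    acl_common::enableDimCorrection(tensor);
}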
-    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
-    assert(output_rank == output_tensor->num_dimensions());
-    if (output_rank != output_tensor->info()->num_dimensions())
-    {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
-    }
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
-  auto fn = acl_common::generateLayer<arm_compute::CLUnstack>(input, outputs, axis);
+  auto fn =
+      acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
+
+  // Revert disabling applied dim_correction
+  if (input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1373,11 +1519,11 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto input = _tensor_reg->getAclTensor(input_index)->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
   ::arm_compute::PaddingList padding_list;
   padding_list.resize(rank);
@@ -1391,21 +1537,26 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   }
   // Disable applied dim_correction
-  size_t input_rank = _ctx.at(input_index).shape().rank();
   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
-  assert(input_rank == input_tensor->num_dimensions());
-  if (input_rank != input_tensor->info()->num_dimensions())
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
-    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-    input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
   auto fn =
       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
-  // Do not revert disabling applied dim_correction CLPadKernel has cl kernel for 4-dimension
-  // It would produce a mistach of result
+  // NOTE Do not revert disabling applied dim_correction for 4D.
+  // It would produce a mismatch of result by incorrect offset_first_element in
+  // ICLKernel::add_tensor_argument<3>().
+  // We have to disable applied dim_correction and not revert enabling for the kernel that slices
+  // 4D to 3D because slicing arm_compute::Window can cause incorrect offset_first_element if the
+  // used tensor is 4D and the tensor's high dimension is 1
+  if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1415,8 +1566,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1429,8 +1580,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1438,6 +1589,30 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   _return_fn = asAclFunction(std::move(fn));
 }
+void KernelGenerator::visit(const ir::operation::Reverse &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
+
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
+
+  // WORKAROUND: acl-cl backend only allows U32 type for axis
+  // ConstantInitializer will resolve S32 type to U32 type
+  if (_ctx.at(axis_index).isConstant() &&
+      (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
+  {
+    axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
+  }
+
+  auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
+      ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
+
+  _return_fn = asAclFunction(std::move(fn));
+}
+
 } // namespace acl_cl
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.h b/runtime/onert/backend/acl_cl/KernelGenerator.h
index d188d6d83..e8a922677 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.h
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -59,6 +59,7 @@ public:
   void visit(const ir::operation::InstanceNorm &) override;
   void visit(const ir::operation::Comparison &) override;
   void visit(const ir::operation::LSTM &) override;
+  void visit(const ir::operation::OneHot &) override;
   void visit(const ir::operation::Pack &) override;
   void visit(const ir::operation::Pool2D &) override;
   void visit(const ir::operation::Permute &) override;
@@ -79,10
+80,12 @@ public: void visit(const ir::operation::LocalResponseNormalization &) override; void visit(const ir::operation::DepthToSpace &) override; void visit(const ir::operation::Split &) override; + void visit(const ir::operation::SplitV &) override; void visit(const ir::operation::Unpack &) override; void visit(const ir::operation::Pad &) override; void visit(const ir::operation::ConvertFp32ToFp16 &) override; void visit(const ir::operation::ConvertFp16ToFp32 &) override; + void visit(const ir::operation::Reverse &) override; private: const ir::Operands &_ctx; diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h index 372ce689e..257bbd3b4 100644 --- a/runtime/onert/backend/acl_common/AclKernelGen.h +++ b/runtime/onert/backend/acl_common/AclKernelGen.h @@ -30,6 +30,20 @@ namespace backend namespace acl_common { +void enableDimCorrection(IACLTensor *tensor) +{ + size_t input_rank = tensor->num_dimensions(); + const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape()) + .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), true); +} + +void disableDimCorrection(IACLTensor *tensor) +{ + size_t input_rank = tensor->num_dimensions(); + const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape()) + .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), false); +} + template <typename Layer, typename... Args> std::unique_ptr<arm_compute::IFunction> generateLayer(Args &&... args) { @@ -138,30 +152,27 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, const auto projection_clip = projection_threshold; assert(cell_clip >= 0.f && projection_clip >= 0.f); - auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index).get(); - auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index).get(); - auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index).get(); - auto output_tensor = tensor_reg->getAclTensor(output_index).get(); + auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index); + auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index); + auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index); + auto output_tensor = tensor_reg->getAclTensor(output_index); - auto input_tensor = tensor_reg->getAclTensor(input_index).get(); + auto input_tensor = tensor_reg->getAclTensor(input_index); - auto input_to_forget_weights_tensor = - tensor_reg->getAclTensor(input_to_forget_weights_index).get(); - auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index).get(); - auto input_to_output_weights_tensor = - tensor_reg->getAclTensor(input_to_output_weights_index).get(); + auto input_to_forget_weights_tensor = tensor_reg->getAclTensor(input_to_forget_weights_index); + auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index); + auto input_to_output_weights_tensor = tensor_reg->getAclTensor(input_to_output_weights_index); auto recurrent_to_forget_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_cell_weights_index).get(); + tensor_reg->getAclTensor(recurrent_to_forget_weights_index); + auto recurrent_to_cell_weights_tensor = tensor_reg->getAclTensor(recurrent_to_cell_weights_index); auto recurrent_to_output_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_output_weights_index).get(); 
+ tensor_reg->getAclTensor(recurrent_to_output_weights_index); - auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index).get(); - auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index).get(); - auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index).get(); - auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index).get(); - auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index).get(); + auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index); + auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index); + auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index); + auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index); + auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index); auto act_info = asActivationLayerInfo(activation); @@ -169,13 +180,13 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, if (has_cifg_param) { auto input_to_input_weights_tensor = - tensor_reg->getAclTensor(input_to_input_weights_index).get(); // optional + tensor_reg->getAclTensor(input_to_input_weights_index); // optional auto recurrent_to_input_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_input_weights_index).get(); // optional + tensor_reg->getAclTensor(recurrent_to_input_weights_index); // optional auto cell_to_input_weights_handle = - has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index).get()->handle() + has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index)->handle() : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index).get(); // optional + auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index); // optional lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), recurrent_to_input_weights_tensor->handle(), cell_to_input_weights_handle, input_gate_bias_tensor->handle()); @@ -183,19 +194,18 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, if (has_peephole_param) { auto cell_to_forget_weights_tensor = - tensor_reg->getAclTensor(cell_to_forget_weights_index).get(); // optional + tensor_reg->getAclTensor(cell_to_forget_weights_index); // optional auto cell_to_output_weights_tensor = - tensor_reg->getAclTensor(cell_to_output_weights_index).get(); // optional + tensor_reg->getAclTensor(cell_to_output_weights_index); // optional lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), cell_to_output_weights_tensor->handle()); } if (has_projection_param) { - auto projection_weights_tensor = - tensor_reg->getAclTensor(projection_weights_index).get(); // optional - auto projection_bias_handle = - has_projection_bias ? tensor_reg->getAclTensor(projection_bias_index).get()->handle() - : nullptr; // optional + auto projection_weights_tensor = tensor_reg->getAclTensor(projection_weights_index); // optional + auto projection_bias_handle = has_projection_bias + ? 
tensor_reg->getAclTensor(projection_bias_index)->handle() + : nullptr; // optional lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); } @@ -260,10 +270,10 @@ kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Ope reshape.dim(1) = input_size; /* W */ } - auto output_tensor = tensor_reg->getAclTensor(output_index).get(); - const auto input_tensor = tensor_reg->getAclTensor(input_index).get(); - const auto weight_tensor = tensor_reg->getAclTensor(weight_index).get(); - const auto bias_tensor = tensor_reg->getAclTensor(bias_index).get(); + auto output_tensor = tensor_reg->getAclTensor(output_index); + const auto input_tensor = tensor_reg->getAclTensor(input_index); + const auto weight_tensor = tensor_reg->getAclTensor(weight_index); + const auto bias_tensor = tensor_reg->getAclTensor(bias_index); const auto frontend_layout = layout; const auto acl_layout = output_tensor->handle()->info()->data_layout(); @@ -313,8 +323,8 @@ kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands, VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl; VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl; - auto ofm_tensor = tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = tensor_reg->getAclTensor(ifm_index); ::arm_compute::PoolingLayerInfo info{ pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(), diff --git a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h index 83d7ad6fd..beec95718 100644 --- a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h +++ b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h @@ -61,8 +61,14 @@ public: for (const auto &ind : inputs) { - // NOTE Not support the case that concat's input is a constant or a input of model - if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind)) + /** + * NOTE Not support below cases. + * 1. concat's input is a constant. + * 2. concat's input is a input of model. + * 3. concat's input already becomes a subtensor of another concat. 
+ */ + if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind) || + _parent_map.find(ind) != _parent_map.end()) { return; } diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h index 91452014b..bb7abc95d 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -70,8 +70,6 @@ public: void allocate() override; void postFunctionPrepare() override; - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - T_AclTensorManager *acl_tensor_manager(void) { return _tensor_mgr.get(); } void setUsesCount(const ir::OperandIndex &index, size_t num_uses) @@ -161,7 +159,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::registerTensorInfo( else { // SubTensors - assert(!info.isConstant() && "Subtensors of constants are not supported yet."); // Update offset info and emplace @@ -306,13 +303,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::postFunctionPrepare(voi } template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> -std::unique_ptr<ITensorManager> -AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::releaseStaticTensorManager(void) -{ - return std::move(_tensor_mgr); -} - -template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void) { assert(_tensor_mgr->constTensors().size() == 0); diff --git a/runtime/onert/backend/acl_common/AclTensorRegistry.h b/runtime/onert/backend/acl_common/AclTensorRegistry.h index 1ef9f4b35..02d66db99 100644 --- a/runtime/onert/backend/acl_common/AclTensorRegistry.h +++ b/runtime/onert/backend/acl_common/AclTensorRegistry.h @@ -36,17 +36,11 @@ template <typename T_AclTensorManager> class AclTensorRegistry : public ITensorR public: AclTensorRegistry(T_AclTensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {} - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override - { - return _tensor_mgr->at(ind); - } + ITensor *getITensor(const ir::OperandIndex &ind) override { return _tensor_mgr->at(ind).get(); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override - { - return getITensor(ind); - } + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getITensor(ind); } - auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind); } + auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); } private: T_AclTensorManager *_tensor_mgr; diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc index 67dcc8192..6ef6a2dc3 100644 --- a/runtime/onert/backend/acl_common/Convert.cc +++ b/runtime/onert/backend/acl_common/Convert.cc @@ -112,6 +112,8 @@ namespace acl_common return ::arm_compute::DataType::S8; case ir::DataType::FLOAT16: return ::arm_compute::DataType::F16; + case ir::DataType::INT64: + return ::arm_compute::DataType::S64; default: throw std::runtime_error("Not supported, yet"); break; @@ -299,6 +301,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) return ir::DataType::QUANT_INT8_SYMM; case ::arm_compute::DataType::F16: return ir::DataType::FLOAT16; + case ::arm_compute::DataType::S64: + return ir::DataType::INT64; default: throw std::runtime_error{"Not supported, yet"}; break; @@ -335,6 +339,27 @@ arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType } } +arm_compute::PixelValue 
asPixelValue(const ir::Operand &operand) +{ + assert(operand.isConstant()); + assert(operand.shape().num_elements() == 1); + switch (operand.typeInfo().type()) + { + case ir::DataType::INT32: + return arm_compute::PixelValue(operand.asScalar<int32_t>()); + case ir::DataType::INT64: + return arm_compute::PixelValue(operand.asScalar<int64_t>()); + case ir::DataType::UINT32: + return arm_compute::PixelValue(operand.asScalar<uint64_t>()); + case ir::DataType::UINT8: + return arm_compute::PixelValue(operand.asScalar<uint8_t>()); + case ir::DataType::FLOAT32: + return arm_compute::PixelValue(operand.asScalar<float>()); + default: + throw std::runtime_error("asPixelValue : Not supported datatype yet"); + } +} + } // namespace acl_common } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_common/Convert.h b/runtime/onert/backend/acl_common/Convert.h index 380321c07..0b36df102 100644 --- a/runtime/onert/backend/acl_common/Convert.h +++ b/runtime/onert/backend/acl_common/Convert.h @@ -17,6 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_COMMON_CONVERT_H__ #define __ONERT_BACKEND_ACL_COMMON_CONVERT_H__ +#include <arm_compute/core/PixelValue.h> #include <arm_compute/core/TensorInfo.h> #include <arm_compute/core/SubTensorInfo.h> #include <arm_compute/core/TensorShape.h> @@ -85,6 +86,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type); arm_compute::PoolingType convertPoolType(ir::operation::Pool2D::PoolType pool_type_ir); arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType reduce_type_ir); +arm_compute::PixelValue asPixelValue(const ir::Operand &operand); + } // namespace acl_common } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index 6d53c1245..598d043e7 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -18,7 +18,6 @@ #include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions -#include <arm_compute/runtime/CPP/functions/CPPOneHotEx.h> #include <AclActivationBuilder.h> #include <AclFunction.h> @@ -75,15 +74,16 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto frontend_layout = _current_op_seq_layout; auto backend_layout = ifm_tensor->layout(); - int axis_value = node.param().axis; + int axis_value = _ctx.at(axis_index).asScalar<int32_t>(); if (axis_value < 0) { axis_value += ifm_rank; @@ -106,9 +106,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); + auto ofm_tensor = 
_tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); assert(_ctx.at(block_size_index).data()); @@ -126,9 +126,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().arithmetic_type) @@ -190,10 +190,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -214,8 +214,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>( input_tensor->handle(), output_tensor->handle(), block_size); @@ -245,10 +245,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -282,7 +282,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(ofm_index); std::vector<::arm_compute::ITensor *> input_tensors; for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle()); @@ -312,8 +312,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto ofm_index{node.getOutputs().at(0)}; const auto 
ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo( node.param().op_type, node.param().alpha, node.param().beta); @@ -343,9 +343,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -390,8 +390,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -412,6 +412,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(), output_tensor->handle()); } + else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8) + { + fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(), + output_tensor->handle()); + } else { fn = acl_common::generateLayer<arm_compute::NECast>( @@ -480,9 +485,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>( values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); @@ -493,7 +498,7 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { const auto output_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); const auto activation = node.param().activation; auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor, @@ 
-512,12 +517,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hits_tensor = _tensor_reg->getAclTensor(hits_index); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto keys_tensor = _tensor_reg->getAclTensor(keys_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>( lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), @@ -539,9 +544,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // Converting in reverse order const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto indices_tensor = _tensor_reg->getAclTensor(indices_index); const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); @@ -567,24 +572,26 @@ void KernelGenerator::visit(const ir::operation::Gather &node) if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - const auto ifm = _ctx.at(ifm_index); - ifm_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(ifm_tensor); } if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction - const auto indices = _ctx.at(indices_index); - indices_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(indices_tensor); } auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>( ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); - // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would - // use arm_compute::TensorInfo::offset_element_in_bytes() - // It would create an error when the kernel accesses high dimension that its value is 1 + // Revert disabling applied dim_correction + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + if (indices_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(indices_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -596,10 +603,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_tensor = 
_tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get(); - auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index); + auto beta_tensor = _tensor_reg->getAclTensor(beta_index); auto epsilon = node.param().epsilon; auto activation = node.param().activation; @@ -630,8 +637,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); @@ -653,8 +660,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); @@ -682,13 +689,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : node.getInputs()) input_indexes.emplace_back(input_index); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); std::vector<arm_compute::ITensor *> inputs; for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) axis += output_rank; @@ -697,22 +704,25 @@ void KernelGenerator::visit(const ir::operation::Pack &node) // Disable applied dim_correction for (const auto &input_index : input_indexes) { - size_t input_rank = _ctx.at(input_index).shape().rank(); const auto &input_tensor = _tensor_reg->getAclTensor(input_index); - assert(input_rank == input_tensor->num_dimensions()); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } } auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output); - // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would - // use arm_compute::TensorInfo::offset_element_in_bytes() - // It would 
create an error when the kernel accesses high dimension that its value is 1 + // Revert disabling applied dim_correction + for (const auto &input_index : input_indexes) + { + const auto &input_tensor = _tensor_reg->getAclTensor(input_index); + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } + } _return_fn = asAclFunction(std::move(fn)); } @@ -727,8 +737,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto rank = _ctx.at(input_index).shape().rank(); auto pad_base = _ctx.at(pad_index).data()->base(); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto input = _tensor_reg->getAclTensor(input_index)->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); ::arm_compute::PaddingList padding_list; padding_list.resize(rank); @@ -737,7 +747,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node) const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); const auto axis = acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; @@ -764,7 +774,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); const auto activation = node.param().activation; _return_fn = std::make_unique<exec::FunctionSequence>( asAclFunction(std::move(raw_fn)), @@ -776,8 +786,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -812,9 +822,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index); auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>( ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); @@ -828,8 +838,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto 
input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); @@ -866,8 +876,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. @@ -887,11 +897,10 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::NEScale>( ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, @@ -916,14 +925,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get(); - auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); - auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); + auto weights_tensor = _tensor_reg->getAclTensor(weights_index); + auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); + auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>( @@ -949,8 +958,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(), output_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); @@ -962,25 +971,26 @@ void KernelGenerator::visit(const 
ir::operation::Softmax &node) const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_tensor->layout(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // Disable applied dim_correction - const size_t input_rank = _ctx.at(input_index).shape().rank(); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and input tensor is applied dim_correction - const auto input = _ctx.at(input_index); - input_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); + acl_common::disableDimCorrection(input_tensor); } auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(), output_tensor->handle(), beta); + // Revert disabling applied dim_correction + if (input_tensor->dimension(0) == 1) + { + acl_common::disableDimCorrection(input_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } @@ -992,10 +1002,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); - auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); + auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -1014,8 +1024,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>( ifm_tensor->handle(), ofm_tensor->handle(), block_size); @@ -1027,22 +1037,27 @@ void KernelGenerator::visit(const ir::operation::Split &node) { // TODO Support this op by SubTensor const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + if (!_ctx.at(axis_index).isConstant()) + { + throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend"); + } const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); std::vector<ir::OperandIndex> output_indexes; for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_tensor = 
_tensor_reg->getAclTensor(ifm_index).get(); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); std::vector<arm_compute::ITensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - auto axis = node.param().axis; + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); if (axis < 0) axis += ifm_rank; axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); @@ -1059,9 +1074,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); @@ -1076,8 +1091,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -1141,8 +1156,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -1211,10 +1226,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) strides_set.set(i, strides[i]); } + // Disable applied dim_correction + if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(inputData_tensor); + } + auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>( inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); + // Revert disabling applied dim_correction + if (inputData_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(inputData_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } 
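// ----------------------------------------------------------------------------
// Note on the dim_correction changes in the hunks above: the Gather, Pack,
// Softmax, StridedSlice and Unpack visitors drop the old pattern of manually
// calling info()->set_tensor_shape(asTensorShape(..., /*dim_correction*/ false))
// and instead use acl_common::disableDimCorrection()/enableDimCorrection(),
// now also reverting the correction once the ACL layer has been generated.
// The real helpers live in the acl_common backend and are not shown in this
// diff; the sketch below only illustrates the assumed idea via
// arm_compute::TensorShape::set()'s apply_dim_correction flag. The function
// name, template parameter and usage are illustrative assumptions, not the
// actual onert API.

#include <arm_compute/core/TensorShape.h>

// T_AclTensor stands for any ACL tensor wrapper exposing num_dimensions() and
// info(), i.e. a hypothetical stand-in for the backend's tensor type.
template <typename T_AclTensor> void setDimCorrection(T_AclTensor *tensor, bool apply)
{
  const size_t rank = tensor->num_dimensions();
  if (rank == 0)
    return;
  arm_compute::TensorShape shape = tensor->info()->tensor_shape(); // copy
  // Re-setting the highest dimension with apply_dim_correction=false keeps
  // trailing size-1 dimensions visible to kernels that address memory through
  // TensorInfo::offset_element_in_bytes(); passing true restores the default
  // collapsing behaviour.
  shape.set(rank - 1, shape[rank - 1], apply);
  tensor->info()->set_tensor_shape(shape);
}

// Assumed usage, mirroring the generated kernels above:
//   if (t->num_dimensions() != t->info()->num_dimensions()) setDimCorrection(t, false);
//   ... generate the ACL layer ...
//   if (t->dimension(0) == 1) setDimCorrection(t, true);
// ----------------------------------------------------------------------------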
@@ -1244,9 +1272,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); @@ -1261,26 +1289,43 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; + const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - const auto rank = _ctx.at(ifm_idx).shape().rank(); - std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); - auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector( - rank, pv, frontend_layout, backend_layout); - std::unique_ptr<::arm_compute::IFunction> fn; - if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2) + const auto &perms = _ctx.at(perm_idx); + std::vector<int32_t> pv; + if (perms.shape() == ir::Shape{0}) + { + pv.resize(rank); + std::iota(pv.begin(), pv.end(), 0); + std::reverse(pv.begin(), pv.end()); + } + else + { + pv = _ctx.at(perm_idx).asVector<int32_t>(); + } + + std::unique_ptr<arm_compute::IFunction> fn; + if (rank == 1) { + fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle()); + } + else if (rank == 2) + { + assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0); fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(), ofm_tensor->handle()); } else { + auto backend_pv = + acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout); + fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); } @@ -1298,34 +1343,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : node.getOutputs()) output_indexes.emplace_back(output_index); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::vector<arm_compute::ITensor *> outputs; for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes; - for (const auto &output_index : output_indexes) + if 
(input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_tensor = _tensor_reg->getAclTensor(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); - assert(output_rank == output_tensor->num_dimensions()); - if (output_rank != output_tensor->info()->num_dimensions()) - { - // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); - } + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } - auto fn = acl_common::generateLayer<arm_compute::NEUnstack>(input, outputs, axis); + auto fn = + acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis); + + // Revert disabling applied dim_correction + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -1335,8 +1378,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(), output_tensor->handle()); @@ -1352,9 +1395,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get(); - auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input0_tensor = _tensor_reg->getAclTensor(input0_index); + auto input1_tensor = _tensor_reg->getAclTensor(input1_index); auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>( input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), @@ -1370,15 +1413,20 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)}; const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)}; const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)}; - const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getAclTensor(out_idx).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_idx).get(); - auto depth_tensor = _tensor_reg->getAclTensor(depth_idx).get(); - auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx).get(); - auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx).get(); + auto output_tensor = _tensor_reg->getAclTensor(out_idx); + auto indices_tensor = _tensor_reg->getAclTensor(indices_idx); + auto depth_tensor = _tensor_reg->getAclTensor(depth_idx); + auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx); + auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx); + + const size_t output_rank = 
_ctx.at(out_idx).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = output_tensor->layout(); + int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); - auto fn = acl_common::generateLayer<arm_compute::CPPOneHotEx>( + auto fn = acl_common::generateLayer<arm_compute::NEOneHot>( indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(), output_tensor->handle(), axis); _return_fn = asAclFunction(std::move(fn)); diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h index 6627412d2..32e249f5a 100644 --- a/runtime/onert/backend/cpu/ExternalContext.h +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -39,16 +39,13 @@ public: ExternalContext() : _ruy_context(new ruy::Context) { setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); -#ifdef USE_RUY_GEMV - _ruy_context->cache_policy = ruy::kCacheLHSOnNarrowMul; -#endif } void setMaxNumThreads(int max_num_threads) { const int target_num_threads = max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; - _ruy_context->max_num_threads = target_num_threads; + _ruy_context->set_max_num_threads(target_num_threads); } ruy::Context *ruy_context() const { return _ruy_context.get(); } diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 74b6f0c6b..5f330ff50 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -232,12 +232,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) dyn_ctx->op_seq = &op_seq; dyn_ctx->operations = &_operations_ctx; dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); - dyn_ctx->tensor_registry = _tensor_reg; dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager(); _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); } - _return_fn_seq->enableDynamicShapeInferer(true); _current_op_seq_layout = op_seq.getLayout(); for (const auto &operation_idx : op_seq.operations()) @@ -272,10 +270,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); const auto stride = node.param().stride; const auto activation = node.param().activation; @@ -332,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get(); + auto ofm_tensor = 
_tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>(); @@ -353,11 +351,11 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto rank = _ctx.at(ofm_index).shape().rank(); const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique<ops::ConcatLayer>(); @@ -372,9 +370,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)}; const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto input_alloc = _tensor_reg->getPortableTensor(input_index).get(); - auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto input_alloc = _tensor_reg->getPortableTensor(input_index); + auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index); auto fn = std::make_unique<ops::BatchToSpaceNDLayer>(); @@ -384,7 +382,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) if (node.getInputs().size() != NNApiInputs) { const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)}; - crops_alloc = _tensor_reg->getPortableTensor(crops_data_index).get(); + crops_alloc = _tensor_reg->getPortableTensor(crops_data_index); } fn->configure(input_alloc, output_alloc, block_size_alloc, crops_alloc); @@ -398,9 +396,9 @@ void KernelGenerator::visit(const ir::operation::Fill &node) const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto value_tensor = _tensor_reg->getPortableTensor(value_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto value_tensor = _tensor_reg->getPortableTensor(value_index); auto fn = std::make_unique<ops::FillLayer>(); @@ -419,11 +417,10 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto weight_tensor = _tensor_reg->getPortableTensor(weight_index).get(); - auto bias_tensor = - bias_index.undefined() ? 
nullptr : _tensor_reg->getPortableTensor(bias_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto weight_tensor = _tensor_reg->getPortableTensor(weight_index); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index); auto fn = std::make_unique<ops::FullyConnectedLayer>(); @@ -438,8 +435,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); // optional 2nd input IPortableTensor *shape_tensor = nullptr; @@ -447,7 +444,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) if (node.getInputs().size() == 2) { const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; - shape_tensor = _tensor_reg->getPortableTensor(shape_index).get(); + shape_tensor = _tensor_reg->getPortableTensor(shape_index); } auto fn = std::make_unique<ops::ReshapeLayer>(); @@ -461,8 +458,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); // Squeeze can share same kernel with reshape auto fn = std::make_unique<ops::ReshapeLayer>(); @@ -479,8 +476,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::SoftMaxLayer>(); @@ -497,9 +494,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::BinaryArithmeticLayer>(); @@ -515,9 +512,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto 
lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto comparison_type = node.param().comparison_type; @@ -534,9 +531,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto indices_tensor = _tensor_reg->getPortableTensor(indices_index); const auto backend_layout = output_tensor->layout(); UNUSED_RELEASE(backend_layout); @@ -575,11 +572,11 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get(); - auto depth_tensor = _tensor_reg->getPortableTensor(depth_index).get(); - auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index).get(); - auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto indices_tensor = _tensor_reg->getPortableTensor(indices_index); + auto depth_tensor = _tensor_reg->getPortableTensor(depth_index); + auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index); + auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index); assert(indices_tensor->data_type() == OperandType::INT32); assert(axis <= static_cast<int>(indices_tensor->num_dimensions())); @@ -595,10 +592,10 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto equation = node.param().equation; @@ -613,7 +610,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node) { auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector<custom::TypeInfo> &types, - std::vector<std::shared_ptr<IPortableTensor>> &tensors) { + std::vector<IPortableTensor *> &tensors) { for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); @@ -642,8 +639,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ElementwiseActivationLayer>(); @@ -659,9 +656,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto 
lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::ElementwiseBinaryLayer>(); @@ -676,8 +673,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ElementwiseUnaryLayer>(); @@ -692,9 +689,9 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ExpandDimsLayer>(); @@ -712,11 +709,11 @@ void KernelGenerator::visit(const ir::operation::Pack &node) assert(-rank <= axis && axis < rank); - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique<ops::PackLayer>(); @@ -734,11 +731,11 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) assert(rank == 0 || (-rank <= axis && axis < rank)); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); std::vector<IPortableTensor *> output_tensors; for (auto &output_idx : node.getOutputs()) - output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::UnpackLayer>(); @@ -756,8 +753,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node) const auto output_index{node.getOutputs().at(0)}; assert(_ctx.at(pad_index).data()); - auto input = _tensor_reg->getPortableTensor(input_index).get(); - auto output = _tensor_reg->getPortableTensor(output_index).get(); + auto input = _tensor_reg->getPortableTensor(input_index); + auto output = _tensor_reg->getPortableTensor(output_index); auto pad_rank = _ctx.at(pad_index).shape().dim(0); auto pad_base = 
reinterpret_cast<const int32_t *>(_ctx.at(pad_index).data()->base()); @@ -780,13 +777,15 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto perm_tensor = _tensor_reg->getPortableTensor(perm_index); auto fn = std::make_unique<ops::TransposeLayer>(); - fn->configure(input_tensor, output_tensor, node.param().perm); + fn->configure(input_tensor, perm_tensor, output_tensor); _return_fn = std::move(fn); } @@ -798,9 +797,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; const auto keep_dims = node.param().keep_dims; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axes_tensor = _tensor_reg->getPortableTensor(axes_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axes_tensor = _tensor_reg->getPortableTensor(axes_index); if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) { @@ -828,10 +827,10 @@ void KernelGenerator::visit(const ir::operation::Select &node) const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto condition_tensor = _tensor_reg->getPortableTensor(condition_index).get(); - auto true_tensor = _tensor_reg->getPortableTensor(true_index).get(); - auto false_tensor = _tensor_reg->getPortableTensor(false_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto condition_tensor = _tensor_reg->getPortableTensor(condition_index); + auto true_tensor = _tensor_reg->getPortableTensor(true_index); + auto false_tensor = _tensor_reg->getPortableTensor(false_index); auto fn = std::make_unique<ops::SelectLayer>(); @@ -847,10 +846,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto begins_tensor = _tensor_reg->getPortableTensor(begins_index).get(); - auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto begins_tensor = _tensor_reg->getPortableTensor(begins_index); + auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index); auto fn = std::make_unique<ops::SliceLayer>(); @@ -867,11 +866,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto 
ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto starts_tensor = _tensor_reg->getPortableTensor(starts_index).get(); - auto ends_tensor = _tensor_reg->getPortableTensor(ends_index).get(); - auto strides_tensor = _tensor_reg->getPortableTensor(strides_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto starts_tensor = _tensor_reg->getPortableTensor(starts_index); + auto ends_tensor = _tensor_reg->getPortableTensor(ends_index); + auto strides_tensor = _tensor_reg->getPortableTensor(strides_index); auto begin_mask = node.param().begin_mask; auto end_mask = node.param().end_mask; @@ -891,19 +890,18 @@ void KernelGenerator::visit(const ir::operation::Split &node) assert(num_splits == static_cast<int>(node.getOutputs().size())); const auto input_idx{node.getInputs().at(ir::operation::Split::Input::INPUT)}; - const auto rank = _ctx.at(input_idx).shape().rank(); - const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto axis_resolved = axis < 0 ? axis + rank : axis; + const auto axis_idx{node.getInputs().at(ir::operation::Split::Input::AXIS)}; - auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get(); + auto in_tensor = _tensor_reg->getPortableTensor(input_idx); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx); std::vector<IPortableTensor *> out_tensors; for (auto &output_idx : node.getOutputs()) - out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::SplitLayer>(); - fn->configure(in_tensor, num_splits, axis_resolved, out_tensors); + fn->configure(in_tensor, axis_tensor, num_splits, out_tensors); _return_fn = std::move(fn); } @@ -913,8 +911,8 @@ void KernelGenerator::visit(const ir::operation::Shape &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::ShapeLayer>(); @@ -928,18 +926,37 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)}; - auto output_height = node.param().height_out; - auto output_width = node.param().width_out; auto align_corners = node.param().align_corners; auto half_pixel_centers = node.param().half_pixel_centers; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ResizeBilinearLayer>(); - fn->configure(input_tensor, output_tensor, output_height, output_width, align_corners, - half_pixel_centers); + if (node.getInputs().size() == 1) + { + 
fn->configure(input_tensor, output_tensor, node.param().height_out, node.param().width_out, + align_corners, half_pixel_centers); + } + else + { + assert(node.getInputs().size() == 2); + const auto size_index{node.getInputs().at(ir::operation::ResizeBilinear::SIZE)}; + auto size_tensor = _tensor_reg->getPortableTensor(size_index); + if (size_tensor->is_constant()) + { + auto size_vec = _ctx.at(size_index).asVector<int32_t>(); + const auto height_out = size_vec[0]; + const auto width_out = size_vec[1]; + fn->configure(input_tensor, output_tensor, height_out, width_out, align_corners, + half_pixel_centers); + } + else + { + fn->configure(input_tensor, output_tensor, size_tensor, align_corners, half_pixel_centers); + } + } _return_fn = std::move(fn); } @@ -950,9 +967,9 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ReverseLayer>(); @@ -965,15 +982,15 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ArgMax::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::AXIS)}; - const auto axis = node.param().axis; - - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ArgMinMaxLayer>(); - fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis_tensor, /* is_arg_max */ true); _return_fn = std::move(fn); } @@ -992,8 +1009,8 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::PoolLayer>(); @@ -1010,9 +1027,9 @@ void KernelGenerator::visit(const ir::operation::Pow &node) const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = 
std::make_unique<ops::PowLayer>(); @@ -1026,8 +1043,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(0)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto input_alloc = _tensor_reg->getPortableTensor(input_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto input_alloc = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::L2NormLayer>(); @@ -1043,10 +1060,10 @@ void KernelGenerator::visit(const ir::operation::Range &node) const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto start_tensor = _tensor_reg->getPortableTensor(start_index).get(); - auto limit_tensor = _tensor_reg->getPortableTensor(limit_index).get(); - auto delta_tensor = _tensor_reg->getPortableTensor(delta_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto start_tensor = _tensor_reg->getPortableTensor(start_index); + auto limit_tensor = _tensor_reg->getPortableTensor(limit_index); + auto delta_tensor = _tensor_reg->getPortableTensor(delta_index); auto fn = std::make_unique<ops::RangeLayer>(); @@ -1059,8 +1076,8 @@ void KernelGenerator::visit(const ir::operation::Rank &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::RankLayer>(); @@ -1075,9 +1092,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::SqDiffLayer>(); @@ -1091,9 +1108,9 @@ void KernelGenerator::visit(const ir::operation::Tile &node) const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index); auto fn = std::make_unique<ops::TileLayer>(); @@ -1108,10 +1125,10 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; 
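Nearly every hunk in this KernelGenerator.cc follows the same mechanical change: _tensor_reg->getPortableTensor(...) now hands back a raw IPortableTensor * instead of a std::shared_ptr, so the trailing .get() calls disappear. A minimal sketch of the ownership model this implies, with simplified, illustrative names (the real class is the PortableTensorRegistryTemplate reworked later in this patch): the registry keeps sole ownership through unique_ptr and kernel generators only borrow raw pointers.

// Illustrative sketch only, not part of the patch.
#include <memory>
#include <unordered_map>

struct IPortableTensor { virtual ~IPortableTensor() = default; };

class TensorRegistrySketch
{
public:
  void setNativeTensor(int index, std::unique_ptr<IPortableTensor> tensor)
  {
    _native[index] = std::move(tensor); // registry keeps sole ownership
  }

  IPortableTensor *getPortableTensor(int index)
  {
    auto it = _native.find(index);
    return it != _native.end() ? it->second.get() : nullptr; // borrowed pointer, no ref counting
  }

private:
  std::unordered_map<int, std::unique_ptr<IPortableTensor>> _native;
};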
const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index).get(); - auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index); + auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index); auto fn = std::make_unique<ops::MatrixBandPartLayer>(); @@ -1125,9 +1142,9 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); const auto adj_x = node.param().adj_x; const auto adj_y = node.param().adj_y; @@ -1144,9 +1161,9 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto shape_tensor = _tensor_reg->getPortableTensor(shape_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto shape_tensor = _tensor_reg->getPortableTensor(shape_index); auto fn = std::make_unique<ops::BroadcastToLayer>(); @@ -1159,10 +1176,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto epsilon = node.param().epsilon; const auto is_training = node.param().is_training; @@ -1183,8 +1200,8 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) const auto beta = node.param().beta; const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::LogSoftMaxLayer>(); @@ -1200,10 +1217,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; const auto 
padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index).get(); - auto padding_tensor = _tensor_reg->getPortableTensor(padding_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index); + auto padding_tensor = _tensor_reg->getPortableTensor(padding_index); auto fn = std::make_unique<ops::SpaceToBatchNDLayer>(); @@ -1218,8 +1235,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) const auto output_index{node.getOutputs().at(0)}; auto block_size = node.param().block_size; - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); auto fn = std::make_unique<ops::SpaceToDepthLayer>(); @@ -1233,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node) const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)}; const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto shape_alloc = _tensor_reg->getPortableTensor(shape_index).get(); - auto seed_alloc = _tensor_reg->getPortableTensor(seed_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto shape_alloc = _tensor_reg->getPortableTensor(shape_index); + auto seed_alloc = _tensor_reg->getPortableTensor(seed_index); auto fn = std::make_unique<ops::StatelessRandomUniformLayer>(); @@ -1252,13 +1269,13 @@ void KernelGenerator::visit(const ir::operation::SplitV &node) const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)}; const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)}; - auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get(); - auto in_size_splits = _tensor_reg->getPortableTensor(size_splits).get(); - auto in_split_dim = _tensor_reg->getPortableTensor(split_dim).get(); + auto in_tensor = _tensor_reg->getPortableTensor(input_idx); + auto in_size_splits = _tensor_reg->getPortableTensor(size_splits); + auto in_split_dim = _tensor_reg->getPortableTensor(split_dim); std::vector<IPortableTensor *> out_tensors; for (auto &output_idx : node.getOutputs()) - out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::SplitVLayer>(); diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc index 78c98dabf..3edac897c 100644 --- a/runtime/onert/backend/cpu/StaticTensorManager.cc +++ b/runtime/onert/backend/cpu/StaticTensorManager.cc @@ -41,7 +41,7 @@ void StaticTensorManager::allocateNonconsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (!_as_constants[ind] && !tensor->is_dynamic()) { auto *buffer = _nonconst_mgr->getBuffer(ind); @@ -62,13 +62,14 @@ void 
StaticTensorManager::buildTensor(const ir::OperandIndex &ind, assert(!_tensors->getITensor(ind)); if (as_const) { - auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<ExternalTensor>(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, std::move(tensor)); } else { - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, + _dynamic_tensor_manager->dynamic_mem_mgr().get()); + _tensors->setNativeTensor(ind, std::move(tensor)); } _as_constants[ind] = as_const; } diff --git a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h b/runtime/onert/backend/cpu/Tensor.cc index fa2a2d54c..dac8f898b 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h +++ b/runtime/onert/backend/cpu/Tensor.cc @@ -14,23 +14,19 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ -#define __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ - -#include "backend/ITensorRegistry.h" -#include "UserTensor.h" +#include "Tensor.h" namespace onert { namespace backend { -namespace controlflow +namespace cpu { -using UserTensorRegistry = PortableTensorRegistryTemplate<UserTensor>; +// `dynamic_cast` not working across library boundaries on NDK +// With this as a key function, `dynamic_cast` works across dl +ExternalTensor::~ExternalTensor() {} -} // namespace controlflow +} // namespace cpu } // namespace backend } // namespace onert - -#endif // __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 20e60260c..2ad2ad0fb 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -41,6 +41,7 @@ class ExternalTensor : public Tensor { public: ExternalTensor() = delete; + virtual ~ExternalTensor(); public: ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) @@ -95,6 +96,21 @@ public: } } + /** + * @brief Reset reference count to zero and release data + */ + void reset_ref() override + { + assert(_data != nullptr); + assert(_num_references > 0); + _num_references = 0; + + _data.reset(); + _buffer = nullptr; + } + + int32_t num_references() override { return _num_references; } + private: std::shared_ptr<const ir::Data> _data; }; diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc index 828d52f7c..e6bc55b0b 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.cc +++ b/runtime/onert/backend/cpu/TensorBuilder.cc @@ -85,16 +85,6 @@ void TensorBuilder::allocate() // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. 
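The new runtime/onert/backend/cpu/Tensor.cc shown earlier in this hunk is nearly empty on purpose: its only job is to give ExternalTensor an out-of-line virtual destructor, the "key function" the comment refers to. Under the Itanium C++ ABI, a class whose virtual functions are all inline gets its vtable and type_info emitted as weak symbols in every shared object that uses it; across dlopen'ed libraries on Android that can leave dynamic_cast comparing two different type_info copies and failing. A rough sketch of the idea, using made-up names rather than the real classes:

// Illustrative sketch only, not part of the patch.
// widget.h
struct Base { virtual ~Base() = default; };
struct Widget : Base
{
  ~Widget() override; // declared but not defined inline: Widget is given a key function
};

// widget.cc, compiled into exactly one shared library, so Widget's vtable and
// type_info have a single home instead of weak per-library copies
Widget::~Widget() {}

// code in another .so loaded via dlopen() can now do
//   Base *b = ...;
//   auto *w = dynamic_cast<Widget *>(b);
// and get a non-null result for Widget objects, because both sides resolve to
// the same type_info.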
} -std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void) -{ - return std::move(_static_tensor_mgr); -} - -std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void) -{ - return std::move(_dynamic_tensor_mgr); -} - } // namespace cpu } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index b6d5f09cc..448abc229 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -58,12 +58,8 @@ public: void allocate() override; void postFunctionPrepare() override { /* DO NOTHING */} - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); } - std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override; - private: const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg; std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr; diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc index d7b0b2bce..2fd284c91 100644 --- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc @@ -44,24 +44,29 @@ template <typename T> std::function<bool(T, T)> GetComparefunction(bool is_arg_m } } -void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output, int32_t axis, - bool is_arg_max) +void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output, + const IPortableTensor *axis, bool is_arg_max) { _input = input; _output = output; - if (axis < 0) - { - axis += input->num_dimensions(); - } _axis = axis; _is_arg_max = is_arg_max; } void ArgMinMaxLayer::run() { -#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \ - ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()), \ - getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), _axis, \ + if (_axis->total_size() != sizeof(int32_t)) + { + throw std::runtime_error("ArgMinMax: wrong shape of axis"); + } + auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer()); + if (axis < 0) + { + axis += _input->num_dimensions(); + } +#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \ + ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()), \ + getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), axis, \ GetComparefunction<input_type>(_is_arg_max)); if (_output->data_type() == ir::DataType::INT32) { diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h index d7c021624..4c864cb98 100644 --- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h +++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h @@ -33,18 +33,18 @@ namespace ops class ArgMinMaxLayer : public ::onert::exec::IFunction { public: - ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(-1), _is_arg_max(true) {} + ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(nullptr), _is_arg_max(true) {} public: - void configure(const IPortableTensor *indices, IPortableTensor *output, int32_t axis, - bool is_arg_max); + void configure(const IPortableTensor *indices, IPortableTensor *output, + const IPortableTensor *axis, bool is_arg_max); void run() override; private: const IPortableTensor *_input; IPortableTensor *_output; - int32_t _axis; + 
const IPortableTensor *_axis; bool _is_arg_max; }; diff --git a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc index f50c63375..8e51daad5 100644 --- a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc +++ b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc @@ -34,20 +34,21 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T> void eval(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, nnfw::cker::BinaryArithmeticOpParam op_params) { - const bool need_broadcast = - nnfw::cker::ProcessBroadcastShapes(getTensorShape(lhs), getTensorShape(rhs), &op_params); + const auto lhs_shape = getTensorShape(lhs); + const auto rhs_shape = getTensorShape(rhs); + const bool need_broadcast = nnfw::cker::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params); if (need_broadcast) { nnfw::cker::BroadcastBinaryArithmeticOp<arithmetic_type>( - op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), + op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape, + reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), reinterpret_cast<T *>(output->buffer())); return; } nnfw::cker::BinaryArithmeticOp<arithmetic_type>( - op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), + op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape, + reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), reinterpret_cast<T *>(output->buffer())); } diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index 05da33abf..f873a3430 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -158,16 +158,30 @@ void FullyConnectedLayer::fullyConnectedSparseWeight() op_params.float_activation_max = output_activation_max; op_params.activation = convertActivationType(_activation); - int w0_size = getTensorShape(_weights).Dims(0); - const uint16_t *w1_segments = _weights->w1_segments(); - const uint16_t *w1_indices = _weights->w1_indices(); + const uint16_t *w1_segments = _weights->sparsity()->w1_segments(); + const uint16_t *w1_indices = _weights->sparsity()->w1_indices(); - nnfw::cker::FullyConnectedSparseWeight( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments, - w1_indices); + auto block_size = _weights->sparsity()->block_size(); + if (block_size.size() == 0) + { + nnfw::cker::FullyConnectedSparseWeightRandom( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? 
_bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments, + w1_indices); + } + else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1) + { + nnfw::cker::FullyConnectedSparseWeight16x1( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments, + w1_indices); + } + else + throw std::runtime_error{"FullyConnected: unsupported sparsity"}; } void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, @@ -191,7 +205,7 @@ void FullyConnectedLayer::run() { fullyConnectedHybrid(); } - else if (_weights->is_sparse()) + else if (_weights->sparsity()) { fullyConnectedSparseWeight(); } @@ -239,17 +253,11 @@ void FullyConnectedLayer::prepare() const int rows = getTensorShape(_weights).Dims(0); if (rows % 4 == 0) { - const int total_input_size = getTensorShape(_input).FlatSize(); - const int input_size = getTensorShape(_weights).Dims(1); - const int batch_size = total_input_size / input_size; - if (batch_size <= 4) - { - // TODO If it's possible to extract precaching from ruy kernel, - // place this instead of below code + // TODO If it's possible to extract precaching from ruy kernel, + // place this instead of below code - // buffer will be used by ruy kernel as a cache key - _cached_weights = _weights->buffer(); - } + // buffer will be used by ruy kernel as a cache key + _cached_weights = _weights->buffer(); } #endif } diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index 98385521a..eb24dd43c 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -95,27 +95,18 @@ inline nnfw::cker::Shape getTensorShape(const IPortableTensor *tensor) if (tensor == nullptr) return nnfw::cker::Shape(); + const ir::Shape &shape = tensor->get_info().shape(); + assert(tensor->layout() == ir::Layout::NHWC); - constexpr int kMaxSmallSize = 8; - int32_t raw_shape_small[kMaxSmallSize]; - std::vector<int32_t> raw_shape_vec; - auto rank = tensor->num_dimensions(); - int32_t *data = nullptr; - if (rank > kMaxSmallSize) - { - raw_shape_vec.resize(rank); - data = raw_shape_vec.data(); - } - else - { - data = raw_shape_small; - } - for (uint32_t i = 0; i < rank; ++i) + auto rank = shape.rank(); + nnfw::cker::Shape ret(rank); + auto data = ret.DimsData(); + for (int i = 0; i < rank; ++i) { - data[i] = tensor->dimension(i); + data[i] = shape.dim(i); } - return nnfw::cker::Shape(rank, data); + return ret; } inline nnfw::cker::FusedActivationFunctionType diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc index bb5f85d60..4a55b2a33 100644 --- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc +++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc @@ -18,6 +18,7 @@ #include "OperationUtils.h" +#include "cker/neon/neon_check.h" #include <cker/operation/Reduce.h> namespace onert @@ -158,7 +159,7 @@ void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output, ReduceLayer::ReduceLayer() : _input(nullptr), _axes(nullptr), _output(nullptr), _reduce_kernel(new nnfw::cker::Reduce()), - _kernel() + _kernel(), _reduceType(ReduceType::kInvalid) { // DO NOTHING } @@ -171,8 
+172,9 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor _input = input; _axes = axes; _output = output; + _reduceType = reduceType; - switch (reduceType) + switch (_reduceType) { case ReduceType::kSum: if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) @@ -199,13 +201,23 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor _kernel = generateKernelGeneric(_input, keep_dims, *_reduce_kernel, ReduceType::kAll); break; default: - throw std::runtime_error{"ReduceSum: Unsupported reduce type"}; + throw std::runtime_error{"Reduce: Unsupported reduce type"}; } } void ReduceLayer::run() { const auto axes = getReducerAxes(_axes); +#ifdef USE_NEON + int32_t rank = _input->num_dimensions(); + if (_input->data_type() == ir::DataType::FLOAT32 && _reduceType == ReduceType::kSum && + axes.size() == 1 && (axes[0] == -1 || axes[0] == rank - 1)) + { + OptimizedReduceSum(reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_input), + reinterpret_cast<float *>(_output->buffer())); + return; + } +#endif // NEON _kernel(_input, _output, axes); } diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.h b/runtime/onert/backend/cpu/ops/ReduceLayer.h index 332d399bd..8265dd41f 100644 --- a/runtime/onert/backend/cpu/ops/ReduceLayer.h +++ b/runtime/onert/backend/cpu/ops/ReduceLayer.h @@ -17,6 +17,8 @@ #ifndef __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__ #define __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__ +#include "cker/neon/neon_check.h" + #include <backend/IPortableTensor.h> #include <exec/IFunction.h> @@ -47,6 +49,7 @@ enum class ReduceType kMin, kAny, kAll, + kInvalid // For debug and initialize }; class ReduceLayer : public ::onert::exec::IFunction @@ -70,6 +73,8 @@ private: std::function<void(const IPortableTensor *input, IPortableTensor *output, const std::vector<int> &axes)> _kernel; + + ReduceType _reduceType; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc index 180094bb8..1fe56cb99 100644 --- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc @@ -28,16 +28,39 @@ namespace ops { ResizeBilinearLayer::ResizeBilinearLayer() - : _input(nullptr), _output(nullptr), _output_height(0), _output_width(0), _align_corners(false), - _half_pixel_centers(false) + : _input(nullptr), _output(nullptr), _size(nullptr), _output_height(0), _output_width(0), + _align_corners(false), _half_pixel_centers(false) { // DO NOTHING } void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output, + const IPortableTensor *size, bool align_corners, + bool half_pixel_centers) +{ + assert(!size->is_constant()); + _input = input; + _output = output; + _size = size; + _align_corners = align_corners; + _half_pixel_centers = half_pixel_centers; +} + +void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height, int32_t output_width, bool align_corners, bool half_pixel_centers) { + assert(_size == nullptr); + if (output_height < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " + + std::to_string(output_height)}; + } + if (output_width < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " + + std::to_string(output_width)}; + } _input = input; _output = output; _output_height = output_height; @@ -49,10 +72,19 @@ void 
ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTenso void ResizeBilinearLayer::run() { nnfw::cker::ResizeBilinearParams params; + if (_size == nullptr) + { + params.output_height = _output_height; + params.output_width = _output_width; + } + else + { + const auto size_buf = reinterpret_cast<const int32_t *>(_size->buffer()); + params.output_height = size_buf[0]; + params.output_width = size_buf[1]; + } params.align_corners = _align_corners; params.half_pixel_centers = _half_pixel_centers; - params.output_height = _output_height; - params.output_width = _output_width; switch (_input->data_type()) { diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h index fc49b348e..d7ae1c620 100644 --- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h @@ -36,7 +36,10 @@ public: ResizeBilinearLayer(); public: - void configure(const IPortableTensor *input1, IPortableTensor *output, int32_t output_height, + void configure(const IPortableTensor *input1, IPortableTensor *output, + const IPortableTensor *size, bool align_corners, bool half_pixel_centers); + + void configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height, int32_t output_width, bool align_corners, bool half_pixel_centers); void run() override; @@ -44,6 +47,7 @@ public: private: const IPortableTensor *_input; IPortableTensor *_output; + const IPortableTensor *_size; int32_t _output_height; int32_t _output_width; bool _align_corners; diff --git a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc index 095e67abc..b42be3042 100644 --- a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc @@ -62,7 +62,11 @@ void SoftMaxLayer::softmaxFloat32() } else { - throw std::runtime_error{"only 1D, 2D and 4D tensors supported"}; + nnfw::cker::SoftmaxParams op_params; + op_params.beta = _beta; + nnfw::cker::reference::Softmax( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); } } diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.cc b/runtime/onert/backend/cpu/ops/SplitLayer.cc index 1f40654c1..922cde2e3 100644 --- a/runtime/onert/backend/cpu/ops/SplitLayer.cc +++ b/runtime/onert/backend/cpu/ops/SplitLayer.cc @@ -29,7 +29,7 @@ namespace cpu namespace ops { -SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs() +SplitLayer::SplitLayer() : _input(nullptr), _axis(nullptr), _num_splits(0), _outputs() { // DO NOTHING } @@ -37,7 +37,16 @@ SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs() template <typename T> void SplitLayer::split(void) { nnfw::cker::SplitParams op_params; - op_params.axis = _axis; + if (_axis->total_size() != sizeof(int32_t)) + { + throw std::runtime_error("ArgMinMax: wrong shape of axis"); + } + auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer()); + if (axis < 0) + { + axis += _input->num_dimensions(); + } + op_params.axis = axis; op_params.num_split = _num_splits; std::vector<T *> outputPtrs; @@ -53,8 +62,8 @@ template <typename T> void SplitLayer::split(void) getTensorShape(_outputs[0]), outputPtrs.data()); } -void SplitLayer::configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis, - std::vector<IPortableTensor *> &outputs) +void SplitLayer::configure(const IPortableTensor *input, 
const IPortableTensor *axis, + uint16_t num_splits, std::vector<IPortableTensor *> &outputs) { assert(input != nullptr); diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.h b/runtime/onert/backend/cpu/ops/SplitLayer.h index 0719a0063..090f87166 100644 --- a/runtime/onert/backend/cpu/ops/SplitLayer.h +++ b/runtime/onert/backend/cpu/ops/SplitLayer.h @@ -38,15 +38,15 @@ public: public: template <typename T> void split(void); - void configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis, + void configure(const IPortableTensor *input, const IPortableTensor *axis, uint16_t num_splits, std::vector<IPortableTensor *> &outputs); void run() override; private: const IPortableTensor *_input; + const IPortableTensor *_axis; uint16_t _num_splits; - int16_t _axis; std::vector<IPortableTensor *> _outputs; }; diff --git a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc index dcbb87734..f77f4d691 100644 --- a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc +++ b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc @@ -37,17 +37,17 @@ StridedSliceLayer::StridedSliceLayer() template <typename T> void StridedSliceLayer::stridedSliceImpl() { + const auto input_shape = getTensorShape(_input); + const auto output_shape = getTensorShape(_output); auto op_params = nnfw::cker::buildStridedSliceParams( reinterpret_cast<uint32_t *>(_begin->buffer()), reinterpret_cast<uint32_t *>(_end->buffer()), reinterpret_cast<uint32_t *>(_strides->buffer()), _begin_mask, _end_mask, _shrink_axis_mask, - getTensorShape(_input).DimensionsCount()); + input_shape.DimensionsCount()); - nnfw::cker::checkOutputSize(op_params, getTensorShape(_input), getTensorShape(_output), - getTensorShape(_input).DimensionsCount()); + nnfw::cker::checkOutputSize(op_params, input_shape, output_shape, input_shape.DimensionsCount()); - nnfw::cker::StridedSlice(op_params, getTensorShape(_input), - reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output), - reinterpret_cast<T *>(_output->buffer())); + nnfw::cker::StridedSlice(op_params, input_shape, reinterpret_cast<const T *>(_input->buffer()), + output_shape, reinterpret_cast<T *>(_output->buffer())); } void StridedSliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.cc b/runtime/onert/backend/cpu/ops/TransposeLayer.cc index 7b232562a..3362c3396 100644 --- a/runtime/onert/backend/cpu/ops/TransposeLayer.cc +++ b/runtime/onert/backend/cpu/ops/TransposeLayer.cc @@ -19,6 +19,7 @@ #include "OperationUtils.h" #include <cker/operation/Transpose.h> +#include <numeric> namespace onert { @@ -29,7 +30,7 @@ namespace cpu namespace ops { -TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm() +TransposeLayer::TransposeLayer() : _input(nullptr), _perm(nullptr), _output(nullptr) { // DO NOTHING } @@ -37,10 +38,23 @@ TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm() template <typename T> void TransposeLayer::transpose() { nnfw::cker::TransposeParams param; - param.perm_count = _perm.size(); - for (size_t i = 0; i < _perm.size(); i++) + assert(_perm->num_dimensions() == 1); + + param.perm_count = _input->num_dimensions(); + if (_perm->dimension(0) == 0) // This means _perm is (n-1...0) + { + const auto begin = param.perm; + const auto end = param.perm + _input->num_dimensions(); + std::iota(begin, end, 0); + std::reverse(begin, end); + } + else { - param.perm[i] = _perm[i]; + 
assert(param.perm_count == static_cast<int>(_perm->dimension(0))); + for (auto i = 0; i < param.perm_count; i++) + { + param.perm[i] = *(reinterpret_cast<const int32_t *>(_perm->buffer()) + i); + } } nnfw::cker::Transpose(param, getTensorShape(_input), @@ -63,8 +77,8 @@ void TransposeLayer::transposeQuant8() transpose<uint8_t>(); } -void TransposeLayer::configure(const IPortableTensor *input, IPortableTensor *output, - const std::vector<int> &perm) +void TransposeLayer::configure(const IPortableTensor *input, const IPortableTensor *perm, + IPortableTensor *output) { _input = input; _perm = perm; diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.h b/runtime/onert/backend/cpu/ops/TransposeLayer.h index f9cb12770..c8e9f8ae7 100644 --- a/runtime/onert/backend/cpu/ops/TransposeLayer.h +++ b/runtime/onert/backend/cpu/ops/TransposeLayer.h @@ -40,15 +40,15 @@ public: void transposeQuant8(); - void configure(const IPortableTensor *input, IPortableTensor *output, - const std::vector<int> &perm); + void configure(const IPortableTensor *input, const IPortableTensor *perm, + IPortableTensor *output); void run() override; private: const IPortableTensor *_input; + const IPortableTensor *_perm; IPortableTensor *_output; - std::vector<int> _perm; }; } // namespace ops diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt index d58b47ced..344b2a972 100644 --- a/runtime/onert/core/CMakeLists.txt +++ b/runtime/onert/core/CMakeLists.txt @@ -13,6 +13,11 @@ target_link_libraries(onert_core PRIVATE nnfw_coverage) target_link_libraries(onert_core PRIVATE dl ${LIB_PTHREAD}) target_link_libraries(onert_core PRIVATE jsoncpp) target_link_libraries(onert_core INTERFACE ruy_instrumentation) +# NOTE Below line is added to remove warning for android build +# It will be removed after android build uses gold linker +if (ANDROID) + target_link_libraries(onert_core INTERFACE log) +endif (ANDROID) if(ENVVAR_ONERT_CONFIG) target_compile_definitions(onert_core PRIVATE ENVVAR_FOR_DEFAULT_CONFIG) diff --git a/runtime/onert/core/include/backend/CustomKernelBuilder.h b/runtime/onert/core/include/backend/CustomKernelBuilder.h index 101272135..cae2fc1a3 100644 --- a/runtime/onert/core/include/backend/CustomKernelBuilder.h +++ b/runtime/onert/core/include/backend/CustomKernelBuilder.h @@ -49,10 +49,10 @@ struct TypeInfo struct CustomKernelConfigParams { - std::vector<std::shared_ptr<backend::IPortableTensor>> input_tensors; + std::vector<backend::IPortableTensor *> input_tensors; std::vector<TypeInfo> input_types; - std::vector<std::shared_ptr<backend::IPortableTensor>> output_tensors; + std::vector<backend::IPortableTensor *> output_tensors; std::vector<TypeInfo> output_types; char *userdata; diff --git a/runtime/onert/core/include/backend/IDynamicTensorManager.h b/runtime/onert/core/include/backend/IDynamicTensorManager.h index 343c52c4a..67cfda24e 100644 --- a/runtime/onert/core/include/backend/IDynamicTensorManager.h +++ b/runtime/onert/core/include/backend/IDynamicTensorManager.h @@ -39,24 +39,12 @@ struct IDynamicTensorManager : public ITensorManager public: /** - * @brief Set new shape and allocate memory for dynamic tensor. - * If a tensor is dynamic tensor and previously allocated memory exists, - * it will be deallocated. - * If a tensor is static tensor (with previously allocated memory by StaticTensorManager), - * tensor->buffer() will be overwrite to the dynamically allocated memory - * @param ind operand index of a tensor - * @param new_shape tensor's new shape. 
While allocating memory for this new_shape, - * tensor's shape is set to new_shape - */ - virtual void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) = 0; - - /** * @brief Plan when to delete a tensor. Note this planning is done at compilation time. * @param op_ind operation index - * @param operand_ind operand index of input operand of first param op. Operand can be static + * @param tensor candidate ITensor to dealloc. Tensor can be static * or dynamic since tensor type may not be clearly known at compilation time. */ - virtual void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) = 0; + virtual void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) = 0; /** * @brief Deallocate input tensors of op if an input tensor is a dynamic tensor and it won't @@ -64,12 +52,6 @@ public: * @note This will work after calling planDealloc */ virtual void deallocInput(ir::OperationIndex op_ind) = 0; - - /** - * @brief Deallocate an output tensor if the tensor is a dynamic tensor - * @note This will work after calling planDealloc - */ - virtual void deallocSubgraphOutput(ir::OperandIndex ind) = 0; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/IPortableTensor.h b/runtime/onert/core/include/backend/IPortableTensor.h index a05b39a33..1b1f05fe1 100644 --- a/runtime/onert/core/include/backend/IPortableTensor.h +++ b/runtime/onert/core/include/backend/IPortableTensor.h @@ -18,6 +18,8 @@ #define __ONERT_BACKEND_I_PORTABLE_TENSOR_H__ #include "backend/ITensor.h" +#include "ir/OperandInfo.h" +#include "ir/Sparsity.h" namespace onert { @@ -36,14 +38,18 @@ namespace backend class IPortableTensor : public ITensor { public: - virtual ~IPortableTensor() = default; - virtual bool is_sparse() const { return false; } - virtual const uint16_t *w1_segments() const { return nullptr; } - virtual const uint16_t *w1_indices() const { return nullptr; } + IPortableTensor(const ir::OperandInfo &info) : _info(info) {} + + virtual ~IPortableTensor(); + virtual const ir::Sparsity *sparsity() const { return nullptr; } + const ir::OperandInfo &get_info() const { return _info; } public: bool has_padding() const final { return false; } void access(const std::function<void(ITensor &tensor)> &fn) final { fn(*this); } + +protected: + ir::OperandInfo _info; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/ITensor.h b/runtime/onert/core/include/backend/ITensor.h index 12b1c5433..b18dd30a2 100644 --- a/runtime/onert/core/include/backend/ITensor.h +++ b/runtime/onert/core/include/backend/ITensor.h @@ -53,13 +53,19 @@ public: virtual void access(const std::function<void(ITensor &tensor)> &fn) = 0; /** - * @brief Return the dynamic tensor manager + * @brief Set the shape to @c shape and possibly re-allocate the buffer * - * If dynamic tensors are not supported, it returns @c nullptr . + * If a tensor is dynamic tensor and previously allocated memory exists, + * it will be deallocated. + * If a tensor is static tensor (with previously allocated memory by StaticTensorManager), + * @c buffer() will be overwriten * - * @return IDynamicTensorManager* DynamicTensorManager + * @param shape tensor's new shape. 
While allocating memory for this new_shape, + * tensor's shape is set to new_shape + * @return true If applying shape is successful + * @return false If not applying shape is not supported (it throws for other errors) */ - virtual IDynamicTensorManager *dynamic_tensor_manager() { return nullptr; } + virtual bool applyShape(const ir::Shape &) { return false; } /** * @brief Return true if the tensor is constant diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h index f93ab81ae..97721cf19 100644 --- a/runtime/onert/core/include/backend/ITensorBuilder.h +++ b/runtime/onert/core/include/backend/ITensorBuilder.h @@ -89,14 +89,6 @@ public: // methods for static tensor allocation */ virtual void postFunctionPrepare() = 0; - /** - * @brief Release static @c ITensorManger object which was built - * Before calling this, @c allocate must have been called - * - * @return std::unique_ptr<ITensorManager> Tensor Manager object - */ - virtual std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) = 0; - public: // methods for dynamic tensor allocation /** * @brief Get dynamicTensorManager. If a backend does not support dynamic tensor, exception @@ -108,14 +100,6 @@ public: // methods for dynamic tensor allocation * to the end of execution */ virtual IDynamicTensorManager *dynamicTensorManager(void) { return nullptr; } - - /** - * @brief Release dynamic @c ITensorManger object which was built - * Before calling this, @c allocate must have been called - * - * @return std::unique_ptr<ITensorManager> Tensor Manager object - */ - virtual std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) { return nullptr; } }; } // namespace backend diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h index 88fcb0fcd..b256a1fb8 100644 --- a/runtime/onert/core/include/backend/ITensorRegistry.h +++ b/runtime/onert/core/include/backend/ITensorRegistry.h @@ -43,7 +43,7 @@ struct ITensorRegistry * * @note Return tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0; + virtual ITensor *getITensor(const ir::OperandIndex &) = 0; /** * @brief Returns pointer of ITensor among native tensors * @@ -51,17 +51,14 @@ struct ITensorRegistry * * @note Returned tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0; + virtual ITensor *getNativeITensor(const ir::OperandIndex &) = 0; /** * @brief Set the Migrant Tensor which are from other backends * * @return true if supported * @return false if not supported */ - virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &) - { - return false; - } + virtual bool setMigrantTensor(const ir::OperandIndex &, IPortableTensor *) { return false; } }; } // namespace backend @@ -85,41 +82,37 @@ namespace backend template <typename T_Tensor> class PortableTensorRegistryTemplate : public ITensorRegistry { public: - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override + ITensor *getITensor(const ir::OperandIndex &ind) override { static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor."); - auto external_tensor = _migrant.find(ind); - if (external_tensor != _migrant.end()) - return external_tensor->second; + auto _migrant_tensor = _migrant.find(ind); + if (_migrant_tensor != _migrant.end()) + 
return _migrant_tensor->second; return getNativeTensor(ind); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override - { - return getNativeTensor(ind); - } + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getNativeTensor(ind); } - std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind) + IPortableTensor *getPortableTensor(const ir::OperandIndex &ind) { - auto external_tensor = _migrant.find(ind); - if (external_tensor != _migrant.end()) + auto _migrant_tensor = _migrant.find(ind); + if (_migrant_tensor != _migrant.end()) { - if (external_tensor->second) - return external_tensor->second; + if (_migrant_tensor->second) + return _migrant_tensor->second; } return getNativeTensor(ind); } - std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind) + T_Tensor *getNativeTensor(const ir::OperandIndex &ind) { auto tensor = _native.find(ind); if (tensor != _native.end()) - return tensor->second; + return tensor->second.get(); return nullptr; } - bool setMigrantTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) override + bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override { assert(tensor != nullptr); auto itr = _native.find(ind); @@ -129,25 +122,22 @@ public: return true; } - void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor) + void setNativeTensor(const ir::OperandIndex &ind, std::unique_ptr<T_Tensor> &&tensor) { assert(tensor != nullptr); auto itr = _migrant.find(ind); if (itr != _migrant.end()) throw std::runtime_error{"Tried to set a native tensor but a migrant tensor already exists."}; - _native[ind] = tensor; + _native[ind] = std::move(tensor); } - const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; } + const ir::OperandIndexMap<std::unique_ptr<T_Tensor>> &native_tensors() { return _native; } - const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors() - { - return _migrant; - } + const ir::OperandIndexMap<IPortableTensor *> &migrant_tensors() { return _migrant; } private: - ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant; - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native; + ir::OperandIndexMap<IPortableTensor *> _migrant; + ir::OperandIndexMap<std::unique_ptr<T_Tensor>> _native; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h index e3c8c8666..c4e06aa82 100644 --- a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h @@ -44,14 +44,16 @@ public: virtual ~DynamicTensorManager() = default; - void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override; - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout); - void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override; + void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override; void deallocInput(ir::OperationIndex op_ind) override; - void deallocSubgraphOutput(ir::OperandIndex ind) override; + + std::shared_ptr<DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; } + +private: + const ITensor *getRawITensor(ir::OperandIndex ind); private: /** @@ -63,7 +65,8 @@ private: // contains list of dynamic tensor index, which can be 
deallocated after running operation // note: this map could contain static tensor index too. Careful use is required. - std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map; + std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>> + _dealloc_tensor_map; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h index 4be7a1a11..28ec6b803 100644 --- a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h +++ b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h @@ -20,12 +20,14 @@ #include "Allocator.h" #include "backend/IMemoryManager.h" #include "IMemoryPlanner.h" -#include "ir/OperandIndexMap.h" namespace onert { namespace backend { + +class ITensor; + namespace cpu_common { @@ -59,12 +61,12 @@ public: DynamicMemoryManager() = default; virtual ~DynamicMemoryManager() = default; - std::shared_ptr<Allocator> allocate(const ir::OperandIndex &ind, uint32_t capacity); - void deallocate(const ir::OperandIndex &ind); + std::shared_ptr<Allocator> allocate(const ITensor *tensor, uint32_t capacity); + void deallocate(const ITensor *tensor); void deallocate(void); private: - ir::OperandIndexMap<std::shared_ptr<Allocator>> _mem_alloc_map; + std::unordered_map<const ITensor *, std::shared_ptr<Allocator>> _mem_alloc_map; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h index 3f09b7a4a..fa50b551e 100644 --- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h @@ -20,7 +20,6 @@ #include "MemoryManager.h" #include "backend/IStaticTensorManager.h" -#include "backend/IDynamicTensorManager.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "TensorRegistry.h" @@ -32,11 +31,13 @@ namespace backend namespace cpu_common { +class DynamicTensorManager; + class StaticTensorManager : public backend::IStaticTensorManager { public: StaticTensorManager(const std::shared_ptr<TensorRegistry> ®, - IDynamicTensorManager *dynamic_tensor_manager); + DynamicMemoryManager *dynamic_mem_mgr); virtual ~StaticTensorManager() = default; void allocateConsts(void); @@ -57,7 +58,7 @@ private: std::unique_ptr<MemoryManager> _nonconst_mgr; const std::shared_ptr<TensorRegistry> _tensors; ir::OperandIndexMap<bool> _as_constants; - IDynamicTensorManager *_dynamic_tensor_manager; + DynamicMemoryManager *_dynamic_mem_mgr; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/cpu_common/Tensor.h index 974501ecb..5fa20e15d 100644 --- a/runtime/onert/core/include/backend/cpu_common/Tensor.h +++ b/runtime/onert/core/include/backend/cpu_common/Tensor.h @@ -29,16 +29,19 @@ namespace backend namespace cpu_common { +class DynamicMemoryManager; + class Tensor : public IPortableTensor { public: Tensor() = delete; + virtual ~Tensor(); public: Tensor(const ir::OperandInfo &info, const ir::Layout layout, - IDynamicTensorManager *dynamic_tensor_manager) - : _info(info), _layout(layout), _buffer(nullptr), _num_references(0), - _dynamic_tensor_manager(dynamic_tensor_manager), _allocator(nullptr) + DynamicMemoryManager *dynamic_mem_mgr) + : IPortableTensor(info), _layout(layout), _buffer(nullptr), _num_references(0), + 
_dynamic_mem_mgr(dynamic_mem_mgr), _allocator(nullptr) { // DO NOTHING } @@ -94,7 +97,7 @@ public: * W : dimension(2) * C : dimension(3) */ - size_t dimension(size_t index) const override { return _info.shape().dim(index); } + size_t dimension(size_t index) const final override { return _info.shape().dim(index); } size_t num_dimensions() const override { return _info.shape().rank(); } size_t total_size() const override { return _info.total_size(); } size_t calcOffset(const ir::Coordinates &coords) const override; @@ -105,10 +108,8 @@ public: bool is_constant() const override { return _info.isConstant(); } bool is_dynamic() const override { return _info.isDynamic(); } void set_dynamic() override { _info.setDynamic(); } - IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; } - bool is_sparse() const override { return _info.typeInfo().sparse(); } - virtual const uint16_t *w1_segments() const override { return _info.typeInfo().w1_segments(); } - virtual const uint16_t *w1_indices() const override { return _info.typeInfo().w1_indices(); } + bool applyShape(const ir::Shape &new_shape) override; + const ir::Sparsity *sparsity() const override { return _info.typeInfo().sparsity(); } virtual void increase_ref() { @@ -118,6 +119,7 @@ public: ++_num_references; } + virtual void decrease_ref() { assert(_buffer != nullptr || _allocator != nullptr); @@ -136,14 +138,34 @@ public: } } + /** + * @brief Reset reference count to zero and release data + */ + virtual void reset_ref() + { + assert(_buffer != nullptr || _allocator != nullptr); + assert(_num_references > 0); + _num_references = 0; + + // Only constant tensor has allocator pointer + if (_buffer != nullptr) + _buffer = nullptr; + else + { + _allocator->release(); + _allocator = nullptr; + } + } + + virtual int32_t num_references() { return _num_references; } + void setShape(const ir::Shape &new_shape) override; protected: - ir::OperandInfo _info; ir::Layout _layout; uint8_t *_buffer; int32_t _num_references; - IDynamicTensorManager *_dynamic_tensor_manager; + DynamicMemoryManager *_dynamic_mem_mgr; private: /** diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h index b97cb5b7b..5af11074e 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInference.h +++ b/runtime/onert/core/include/compiler/StaticShapeInference.h @@ -70,6 +70,8 @@ private: // TODO Define visitors for operations. List them in alphabetic order. 
void visit(const ir::operation::ArgMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; + void visit(const ir::operation::BCQFullyConnected &op) override; + void visit(const ir::operation::BCQGather &op) override; void visit(const ir::operation::BinaryArithmetic &op) override; void visit(const ir::operation::BroadcastTo &op) override; void visit(const ir::operation::Comparison &op) override; @@ -85,6 +87,7 @@ private: void visit(const ir::operation::Gather &op) override; void visit(const ir::operation::If &op) override; void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::LSTM &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::OneHot &op) override; void visit(const ir::operation::Pack &op) override; diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h index 6f6659659..4a86708d0 100644 --- a/runtime/onert/core/include/exec/DynamicShapeInference.h +++ b/runtime/onert/core/include/exec/DynamicShapeInference.h @@ -51,6 +51,8 @@ public: // Remove TODO when any op starting from the alphabet is added void visit(const ir::operation::ArgMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; + void visit(const ir::operation::BCQFullyConnected &op) override; + void visit(const ir::operation::BCQGather &op) override; void visit(const ir::operation::BinaryArithmetic &op) override; void visit(const ir::operation::BroadcastTo &op) override; void visit(const ir::operation::Comparison &op) override; @@ -65,6 +67,7 @@ public: void visit(const ir::operation::FusedBatchNorm &op) override; void visit(const ir::operation::Gather &op) override; void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::LSTM &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::OneHot &op) override; void visit(const ir::operation::Pack &op) override; diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h index 79a58ed00..49f00dba1 100644 --- a/runtime/onert/core/include/exec/FunctionSequence.h +++ b/runtime/onert/core/include/exec/FunctionSequence.h @@ -79,7 +79,6 @@ public: // methods related to dynamic tensor const ir::OpSequence *op_seq = nullptr; const ir::Operations *operations = nullptr; std::shared_ptr<exec::DynamicShapeInferer> dynamic_shape_inferer = nullptr; - std::shared_ptr<backend::ITensorRegistry> tensor_registry = nullptr; backend::IDynamicTensorManager *dynamic_tensor_manager = nullptr; }; @@ -104,14 +103,25 @@ public: // methods related to dynamic tensor */ void enableDynamicShapeInferer(bool enable) { - _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer && enable; + _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer || enable; } + /** + * @brief Call this function to initialize vars before running + * @note When we run a model with static tensor input and then run with dynamic tensor input, + * _enable_dynamic_shape_inferer is set to @c false at first run. + * Once _enable_dynamic_shape_inferer is set to @c true it cannot be changed to @c false + * only with calling enableDynamicShapeInferer(). So initializing it to @c false is + * necessary. + * @todo This is a quick fix. Adding this will increase time for run(). Find way to optimize. 
+ */ + void initRunning() { _enable_dynamic_shape_inferer = false; } + protected: std::vector<std::unique_ptr<IFunction>> _functions; protected: - bool _enable_dynamic_shape_inferer = true; + bool _enable_dynamic_shape_inferer = false; std::shared_ptr<DynamicTensorCtx> _dynamic_tensor_ctx = nullptr; }; diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h index 6c8bab67c..1d2831dd0 100644 --- a/runtime/onert/core/include/exec/IExecutor.h +++ b/runtime/onert/core/include/exec/IExecutor.h @@ -69,21 +69,6 @@ struct IExecutor using ExecutorMap = std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>>; -// TODO Move this structure to suitable place -/** - * @brief Dynamic allocation info for input tensors - * When user sets shape of input having unknown dims after compilation, memory for the input - * should be allocated before executing kernels. This struct contains information to allocate - * memory. - */ -struct DynAllocInfo -{ - /// @brief index of input tensor whose memory needs to be allocated at execution time - ir::OperandIndex ind; -}; - -using DynAllocInfoMap = std::unordered_map<std::shared_ptr<backend::ITensor>, DynAllocInfo>; - } // namespace exec } // namespace onert diff --git a/runtime/onert/core/include/ir/Operand.h b/runtime/onert/core/include/ir/Operand.h index 1b3a43b02..f149a744b 100644 --- a/runtime/onert/core/include/ir/Operand.h +++ b/runtime/onert/core/include/ir/Operand.h @@ -40,6 +40,7 @@ public: { // DO NOTHING } + explicit Operand(const Operand &) = default; public: const Shape &shape(void) const { return _info.shape(); } diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h index aa01eccaa..2f78cc832 100644 --- a/runtime/onert/core/include/ir/OperandIndexSequence.h +++ b/runtime/onert/core/include/ir/OperandIndexSequence.h @@ -82,6 +82,8 @@ public: public: std::vector<OperandIndex>::const_iterator begin(void) const { return _vec.begin(); } std::vector<OperandIndex>::const_iterator end(void) const { return _vec.end(); } + std::vector<OperandIndex>::iterator begin(void) { return _vec.begin(); } + std::vector<OperandIndex>::iterator end(void) { return _vec.end(); } private: std::vector<OperandIndex> _vec; diff --git a/runtime/onert/core/include/ir/OperandInfo.h b/runtime/onert/core/include/ir/OperandInfo.h index b8e123027..67aeb0e65 100644 --- a/runtime/onert/core/include/ir/OperandInfo.h +++ b/runtime/onert/core/include/ir/OperandInfo.h @@ -117,6 +117,7 @@ public: MemAllocType memAllocType() const { return _alloc_type; } void setAsConstant() { _const = true; } + void setAsNonConst() { _const = false; } bool isConstant() const { // Impossible case: constant and dynamic operand diff --git a/runtime/onert/core/include/ir/Operation.h b/runtime/onert/core/include/ir/Operation.h index 818bd913b..89f7e340d 100644 --- a/runtime/onert/core/include/ir/Operation.h +++ b/runtime/onert/core/include/ir/Operation.h @@ -34,9 +34,12 @@ struct OperationVisitor; class Operation { public: + // TODO Remove default parameter Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs, - const OperandIndexSequence &outputs); - explicit Operation(OperandConstraint input_constr); + const OperandIndexSequence &outputs, + OperandConstraint output_constr = OperandConstraint::createAny()); + explicit Operation(OperandConstraint input_constr, + OperandConstraint output_constr = OperandConstraint::createAny()); Operation(const Operation &) = default; 
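A minimal caller-side sketch of the FunctionSequence change above (not part of this commit; the helper name is illustrative and it assumes run() is the inherited IFunction entry point): because enableDynamicShapeInferer() now OR-latches the flag, initRunning() must reset it before every execution.

#include "exec/FunctionSequence.h"

// Hypothetical caller: reset the latch, then enable dynamic shape inference
// only for this execution when the current inputs are dynamic.
void runFunctionSequence(onert::exec::FunctionSequence &fn_seq, bool inputs_are_dynamic)
{
  fn_seq.initRunning();                                 // reset _enable_dynamic_shape_inferer to false
  fn_seq.enableDynamicShapeInferer(inputs_are_dynamic); // latches to true only when needed
  fn_seq.run();                                         // assumed IFunction::run() entry point
}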
Operation(Operation &&) = default; @@ -62,6 +65,7 @@ public: private: OperandConstraint _input_constr; + OperandConstraint _output_constr; OperandIndexSequence _inputs; OperandIndexSequence _outputs; }; diff --git a/runtime/onert/core/include/ir/Sparsity.h b/runtime/onert/core/include/ir/Sparsity.h new file mode 100644 index 000000000..ad4d8259b --- /dev/null +++ b/runtime/onert/core/include/ir/Sparsity.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef __ONERT_IR_SPARSITY_H__ +#define __ONERT_IR_SPARSITY_H__ + +#include <cassert> +#include <cstdint> +#include <vector> + +namespace onert +{ +namespace ir +{ + +/** + * @brief Structure for Sparse Tensor + */ +struct Sparsity +{ +public: + Sparsity() = default; + Sparsity(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices, + std::vector<int32_t> &&block_size) + : _w1_segments(w1_segments), _w1_indices(w1_indices), _block_size(block_size) + { + } + + /** + * @brief Returns segments array. See compressed sparse row format. + */ + const uint16_t *w1_segments() const { return _w1_segments.data(); } + /** + * @brief Returns indices array. See compressed sparse row format. 
+ */ + const uint16_t *w1_indices() const { return _w1_indices.data(); } + /** + * @brief Returns block size which is used for block sparsity + */ + const std::vector<int32_t> &block_size() const { return _block_size; } + +private: + std::vector<uint16_t> _w1_segments; + std::vector<uint16_t> _w1_indices; + std::vector<int32_t> _block_size; +}; + +} // namespace ir +} // namespace onert + +#endif // __ONERT_IR_SPARSITY_H__ diff --git a/runtime/onert/core/include/ir/TypeInfo.h b/runtime/onert/core/include/ir/TypeInfo.h index 3f7eab4c0..a1ae4d2e4 100644 --- a/runtime/onert/core/include/ir/TypeInfo.h +++ b/runtime/onert/core/include/ir/TypeInfo.h @@ -18,9 +18,11 @@ #define __ONERT_IR_TYPEINFO_H__ #include <cstdint> +#include <memory> #include <vector> #include "ir/DataType.h" +#include "ir/Sparsity.h" namespace onert { @@ -33,7 +35,7 @@ public: TypeInfo() = delete; explicit TypeInfo(DataType type, float scale = 0, int32_t offset = 0) - : _type(type), _scale(scale), _offset(offset), _sparse(false) + : _type(type), _scale(scale), _offset(offset), _sparsity(nullptr) { } @@ -41,18 +43,11 @@ public: DataType type() const { return _type; } float scale() const { return _scale; } int32_t offset() const { return _offset; } - bool sparse() const { return _sparse; } - const uint16_t *w1_segments() const { return _w1_segments.data(); } - const uint16_t *w1_indices() const { return _w1_indices.data(); } + const ir::Sparsity *sparsity() const { return _sparsity.get(); } + void sparsity(std::shared_ptr<ir::Sparsity> sparsity) { _sparsity = sparsity; } public: void type(const DataType type) { _type = type; } - void sparse2DMetadata(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices) - { - _sparse = true; - _w1_segments = w1_segments; - _w1_indices = w1_indices; - } private: DataType _type; @@ -60,9 +55,7 @@ private: float _scale; int32_t _offset; // for sparsity - bool _sparse; - std::vector<uint16_t> _w1_segments; - std::vector<uint16_t> _w1_indices; + std::shared_ptr<ir::Sparsity> _sparsity; }; bool operator==(const TypeInfo &lhs, const TypeInfo &rhs); diff --git a/runtime/onert/core/include/ir/operation/ArgMax.h b/runtime/onert/core/include/ir/operation/ArgMax.h index 8400e1f1e..ea7eabb83 100644 --- a/runtime/onert/core/include/ir/operation/ArgMax.h +++ b/runtime/onert/core/include/ir/operation/ArgMax.h @@ -31,12 +31,12 @@ class ArgMax : public Operation public: enum Input { - INPUT + INPUT = 0, + AXIS = 1 }; struct Param { - int axis; DataType output_type; }; diff --git a/runtime/onert/core/include/ir/operation/LSTM.h b/runtime/onert/core/include/ir/operation/LSTM.h index 1e6c00bf3..027bc6b42 100644 --- a/runtime/onert/core/include/ir/operation/LSTM.h +++ b/runtime/onert/core/include/ir/operation/LSTM.h @@ -26,6 +26,7 @@ namespace ir namespace operation { +// This operation supports only unidirectional sequence lstm class LSTM : public Operation { public: @@ -51,6 +52,10 @@ public: PROJECTION_BIAS = 17, OUTPUT_STATE_IN = 18, CELL_STATE_IN = 19, + INPUT_LAYER_NORMALIZATION_WEIGHTS = 20, + FORGET_LAYER_NORMALIZATION_WEIGHTS = 21, + CELL_LAYER_NORMALIZATION_WEIGHTS = 22, + OUTPUT_LAYER_NORMALIZATION_WEIGHTS = 23, }; enum Output @@ -66,6 +71,7 @@ public: Activation activation; float cell_threshold; float projection_threshold; + bool time_major; }; public: @@ -73,6 +79,7 @@ public: public: void accept(OperationVisitor &v) const override; + std::string name() const override; OpCode opcode() const final { return OpCode::LSTM; } public: diff --git 
a/runtime/onert/core/include/ir/operation/ResizeBilinear.h b/runtime/onert/core/include/ir/operation/ResizeBilinear.h index 29aa496d7..ab330c826 100644 --- a/runtime/onert/core/include/ir/operation/ResizeBilinear.h +++ b/runtime/onert/core/include/ir/operation/ResizeBilinear.h @@ -34,10 +34,12 @@ public: enum Input { INPUT = 0, + SIZE = 1, }; struct Param { + // If the input SIZE exists in inputs, height_out and width_out are not set. Ignore these params int32_t height_out; int32_t width_out; bool align_corners; diff --git a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h index e4d810eeb..10827803e 100644 --- a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h +++ b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h @@ -34,10 +34,12 @@ public: enum Input { INPUT = 0, + SIZE = 1, }; struct Param { + // If the input SIZE exists in inputs, height_out and width_out are not set. Ignore these params int32_t height_out; int32_t width_out; bool align_corners; diff --git a/runtime/onert/core/include/ir/operation/Split.h b/runtime/onert/core/include/ir/operation/Split.h index 60e0fdf15..c415941a4 100644 --- a/runtime/onert/core/include/ir/operation/Split.h +++ b/runtime/onert/core/include/ir/operation/Split.h @@ -29,12 +29,12 @@ class Split : public Operation { public: enum Input { - INPUT = 0 + AXIS = 0, + INPUT = 1, }; struct Param { - int axis; int num_splits; }; diff --git a/runtime/onert/core/include/ir/operation/Transpose.h b/runtime/onert/core/include/ir/operation/Transpose.h index 9631f7aaa..665c9bbce 100644 --- a/runtime/onert/core/include/ir/operation/Transpose.h +++ b/runtime/onert/core/include/ir/operation/Transpose.h @@ -34,26 +34,15 @@ public: enum Input { INPUT = 0, // for an n-D tensor, specifying the tensor to be transposed.
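A small illustration of the ir::Sparsity structure introduced above, assuming the usual compressed sparse row convention (w1_segments holds per-row offsets into w1_indices, which holds the column of each nonzero). This is a sketch, not code from this commit; the factory function is hypothetical.

#include "ir/Sparsity.h"

#include <cstdint>
#include <memory>
#include <vector>

// Sketch: a 3x4 weight matrix with nonzeros at (0,1), (1,0) and (1,3).
// Row r owns entries [w1_segments[r], w1_segments[r+1]) of w1_indices.
std::shared_ptr<onert::ir::Sparsity> makeExampleSparsity()
{
  std::vector<uint16_t> segments{0, 1, 3, 3}; // 3 rows -> 4 offsets
  std::vector<uint16_t> indices{1, 0, 3};     // column index of each nonzero
  std::vector<int32_t> block_size{};          // empty: no block sparsity
  return std::make_shared<onert::ir::Sparsity>(std::move(segments), std::move(indices),
                                               std::move(block_size));
}

The result can then be attached to an operand's TypeInfo through the new sparsity(std::shared_ptr<ir::Sparsity>) setter shown in the TypeInfo.h hunk above.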
- }; - - struct Param - { - std::vector<int> perm; + PERMUTATION = 1, }; public: - Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, - const Param ¶m); + Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); public: void accept(OperationVisitor &v) const override; OpCode opcode() const final { return OpCode::Transpose; } - -public: - const Param ¶m() const { return _param; } - -private: - Param _param; }; } // namespace operation diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst index 5077fad69..30f211011 100644 --- a/runtime/onert/core/include/util/Config.lst +++ b/runtime/onert/core/include/util/Config.lst @@ -35,6 +35,7 @@ CONFIG(OP_SEQ_MAX_NODE , int , "0") CONFIG(TRACE_FILEPATH , std::string , "") CONFIG(FP16_ENABLE , bool , "0") CONFIG(RUY_THREADS , int , "-1") +CONFIG(USE_MMAPED_DATA , bool , "0") // Auto-generate all operations diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h index 1ebed48f2..701b835d2 100644 --- a/runtime/onert/core/include/util/ShapeInference.h +++ b/runtime/onert/core/include/util/ShapeInference.h @@ -47,7 +47,14 @@ ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank); ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape, const ir::operation::BatchMatMul::Param ¶m); -ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer); +ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf); + +ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf, int rank, + const ir::operation::BCQGather::Param ¶m); + +ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf); ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat::Param ¶m); @@ -63,7 +70,7 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis); -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buf); +ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf); ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &ker_shape); @@ -97,12 +104,12 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &input_true_shape, const ir::Shape &input_false_shape); -ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, - const int32_t *sizes); +ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf, + const int32_t *sizes_buf); ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape, - const ir::Shape &padding_shape, const int32_t *block_shape_data, - const int32_t *padding_data); + const ir::Shape &padding_shape, const int32_t *block_shape_buf, + const int32_t *padding_buf); ir::Shape inferSplitShape(const ir::Shape input_shape, int axis_value, int num_splits); @@ -132,9 +139,11 @@ StridedSliceParams buildStridedSliceParams(const T *begin, const T *end, const T ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSliceParams &op_params, uint32_t rank); -ir::Shape inferTileShape(const ir::Shape &in_shape, 
const int32_t *multiplier); +ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf, + const int32_t multiplier_size); -ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm); +ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf, + const int32_t rank); ir::Shape inferUnpackShape(const ir::Shape &input_shape, int axis, int rank); diff --git a/runtime/onert/core/include/util/Utils.h b/runtime/onert/core/include/util/Utils.h index 847fb6971..8a4eea32b 100644 --- a/runtime/onert/core/include/util/Utils.h +++ b/runtime/onert/core/include/util/Utils.h @@ -22,6 +22,87 @@ #ifndef __ONERT_UTIL_UTILS_H__ #define __ONERT_UTIL_UTILS_H__ +#include "ir/Coordinates.h" +#include "ir/Shape.h" + #define UNUSED_RELEASE(a) (void)(a) +template <size_t from, size_t to, typename Enable = void> struct ForEachDimension +{ + template <typename L, typename... Args> + static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, + L &&lambda_function, Args &&... args) + { + static_assert(from < to, "from must not be less than to"); + assert(static_cast<int>(to) <= shape.rank()); + const auto &d = shape.dim(from); + + for (auto v = 0; v < d; v++) + { + coords.set(from, v); + ForEachDimension<from + 1, to>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + } + } +}; + +template <size_t from, size_t to> +struct ForEachDimension<from, to, typename std::enable_if<from == to>::type> +{ + template <typename L, typename... Args> + static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, + L &&lambda_function, Args &&... args) + { + UNUSED_RELEASE(shape); + assert(static_cast<int>(to) <= shape.rank()); + lambda_function(coords, std::forward<Args>(args)...); + } +}; + +template <typename L, typename... Args> +inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args &&... args) +{ + assert(shape.rank() > 0); + for (auto i = 0; i < shape.rank(); ++i) + { + assert(shape.dim(i) > 0); + } + + onert::ir::Coordinates coords; + switch (shape.rank()) + { + case 0: + coords.set(0, 0); + ForEachDimension<0, 0>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 1: + ForEachDimension<0, 1>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 2: + ForEachDimension<0, 2>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 3: + ForEachDimension<0, 3>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 4: + ForEachDimension<0, 4>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 5: + ForEachDimension<0, 5>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 6: + ForEachDimension<0, 6>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + default: + assert(false && "ShapeLoop, 1 <= Shape'rank <= 6"); + break; + } +} #endif // __ONERT_UTIL_UTILS_H__ diff --git a/runtime/libs/ndarray/src/Array.cpp b/runtime/onert/core/src/backend/IPortableTensor.cc index f9c9de9d3..cec34e780 100644 --- a/runtime/libs/ndarray/src/Array.cpp +++ b/runtime/onert/core/src/backend/IPortableTensor.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include "ndarray/Array.h" +#include "backend/IPortableTensor.h" -namespace ndarray +namespace onert +{ +namespace backend { -template class Array<float>; -template class Array<int32_t>; -template class Array<uint32_t>; -template class Array<uint8_t>; +// `dynamic_cast` not working across library boundaries on NDK +// With this as a key function, `dynamic_cast` works across dl +IPortableTensor::~IPortableTensor() {} -} // namespace ndarray +} // namespace backend +} // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.h b/runtime/onert/core/src/backend/controlflow/BackendContext.h new file mode 100644 index 000000000..d179bfde4 --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/BackendContext.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include "ExternalContext.h" + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, + std::shared_ptr<ITensorBuilder> tensor_builder = nullptr, + std::shared_ptr<IConstantInitializer> constant_initializer = nullptr, + std::shared_ptr<IKernelGenerator> kernel_gen = nullptr, + std::shared_ptr<ITensorRegister> tensor_register = nullptr, + std::shared_ptr<IOptimizer> optimizer = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry, tensor_builder, + constant_initializer, kernel_gen, tensor_register, + optimizer), + _external_context(std::make_shared<ExternalContext>()) + { + } + + std::shared_ptr<ExternalContext> external_context() { return _external_context; } + +private: + // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, + // the thread pool is also created in duplicate + // TODO Create one ruy context for session + std::shared_ptr<ExternalContext> _external_context; +}; + +} // namespace controlflow +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc index 1288e4c96..77f02969d 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc @@ -17,8 +17,7 @@ #include "DynamicTensorManager.h" #include "util/logging.h" -#include 
"util/Exceptions.h" -#include "ir/DataType.h" +#include "misc/polymorphic_downcast.h" namespace onert { @@ -33,82 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry> // DO NOTHING } -void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) -{ - // NOTE Handle user tensors first - auto user_tensor = _tensors->getNativeUserTensor(ind); - if (user_tensor) - { - // User tensors cannot be reallocated. - auto buffer_size = user_tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(user_tensor->data_type()); - if (buffer_size < new_size) - throw InsufficientBufferSizeException{"Output buffer size is less than output tensor size"}; - user_tensor->setShape(new_shape); - return; - } - - // NOTE Then handle own tensors - auto tensor = _tensors->getNativeOwnTensor(ind); - assert(tensor); - - bool previously_dynamic = tensor->is_dynamic(); - - auto allocTensorMem = [&](bool overwrite = false) { - auto capacity = tensor->total_size(); - auto alloc = _dynamic_mem_mgr->allocate(ind, capacity); - - if (overwrite) - tensor->overwriteBuffer(alloc); - else - tensor->setBuffer(alloc); - }; - - if (!previously_dynamic) - { - // TODO deallocate tensor->buffer() - // issue is that staticTensorManager might have allocate this memory - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else if (tensor->buffer() == nullptr) - { - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(); - } - // when buffer was already allocated and new_shape requires different size - else - { - auto previous_size = tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type()); - if (previous_size != new_size) - { - _dynamic_mem_mgr->deallocate(ind); - - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else - { // when buffer with same size was already allocated, shape could differ - tensor->setShape(new_shape); - } - } -} - void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout, this); - _tensors->setNativeOwnTensor(ind, tensor); + auto tensor = + std::make_unique<cpu_common::Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get()); + _tensors->setNativeOwnTensor(ind, std::move(tensor)); } -void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) { - _dealloc_tensor_map[op_ind].emplace(operand_ind); + _dealloc_tensor_map[op_ind].emplace(tensor); } void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) @@ -118,25 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) return; auto &input_set = find->second; - for (auto input_ind : input_set) + for (auto *tensor : input_set) { - if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) + if (!tensor->is_dynamic()) continue; - _dynamic_mem_mgr->deallocate(input_ind); - VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value() + _dynamic_mem_mgr->deallocate(tensor); + + auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor); + cpu_tensor->resetBuffer(); + + VERBOSE(DynamicTensorManager) << "Deallocating a tensor " << (void *)tensor << " (input of op_ind: " << op_ind.value() << ")" << std::endl; } } 
-void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) +const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind) { - if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) - return; - - _dynamic_mem_mgr->deallocate(output_ind); - VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value() - << " (output of a subgraph)" << std::endl; + auto ptr = _tensors->getITensor(ind); + assert(ptr); + return ptr; } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h index dbe388ba2..fb822a917 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h @@ -43,14 +43,16 @@ public: virtual ~DynamicTensorManager() = default; - void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override; - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout); - void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override; + void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override; void deallocInput(ir::OperationIndex op_ind) override; - void deallocSubgraphOutput(ir::OperandIndex ind) override; + + std::shared_ptr<cpu_common::DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; } + +private: + const ITensor *getRawITensor(ir::OperandIndex ind); private: /** @@ -60,9 +62,10 @@ private: std::shared_ptr<cpu_common::DynamicMemoryManager> _dynamic_mem_mgr; const std::shared_ptr<TensorRegistry> _tensors; - // contains list of dynamic tensor index, which can be deallocated after running operation - // note: this map could contain static tensor index too. Careful use is required. - std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map; + // contains list of dynamic tensor, which can be deallocated after running operation + // note: this map could contain static tensor too. Careful use is required. + std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>> + _dealloc_tensor_map; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/ExternalContext.h b/runtime/onert/core/src/backend/controlflow/ExternalContext.h new file mode 100644 index 000000000..58bccb6c6 --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/ExternalContext.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ +#define __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ + +#include <backend/IExternalContext.h> +#include <util/ConfigSource.h> +#include <ruy/context.h> + +namespace +{ +const int kDefaultNumThreadpoolThreads = 1; +} + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +// TODO Unify this with cpu::ExternalContext +class ExternalContext : public IExternalContext +{ +public: + ExternalContext() : _ruy_context(nullptr) + { + // setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); + } + + void setMaxNumThreads(int max_num_threads) + { + const int target_num_threads = + max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; + _ruy_context->set_max_num_threads(target_num_threads); + } + + ruy::Context *ruy_context() const { return _ruy_context.get(); } + +private: + const std::unique_ptr<ruy::Context> _ruy_context; +}; + +} // namespace controlflow +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc index de5a6a5f6..d76ca53e3 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc @@ -58,12 +58,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) dyn_ctx->op_seq = &op_seq; dyn_ctx->operations = &_graph.operations(); dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); - dyn_ctx->tensor_registry = _tensor_reg; dyn_ctx->dynamic_tensor_manager = _dyn_tensor_manager; _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); } - _return_fn_seq->enableDynamicShapeInferer(true); for (const auto &op_idx : op_seq.operations()) { @@ -78,7 +76,7 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto then_subg_index = node.param().then_subg_index; const auto else_subg_index = node.param().else_subg_index; - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; + std::vector<backend::ITensor *> input_tensors; for (const auto input_index : node.getInputs()) { auto input_tensor = getTensor(input_index); @@ -86,14 +84,11 @@ void KernelGenerator::visit(const ir::operation::If &node) input_tensors.emplace_back(input_tensor); } - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; - exec::DynAllocInfoMap outputs_dyn_alloc_info; + std::vector<backend::ITensor *> output_tensors; for (const auto output_index : node.getOutputs()) { auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_tensor); - outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index}; } // IfLayer just set ExecutorMap instead of then and else executor to avoid complexity of @@ -101,8 +96,8 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto cond_tensor = input_tensors.front(); input_tensors.erase(input_tensors.begin()); auto fn = std::make_unique<::onert::backend::controlflow::kernel::IfLayer>( - cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info, - then_subg_index, else_subg_index, _executor_map); + cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, then_subg_index, + else_subg_index, _executor_map); _return_fn = std::move(fn); } @@ -113,14 +108,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto input_index{node.getInputs().at(0)}; // Add 
PermuteLayer - std::vector<std::shared_ptr<ITensor>> output_tensors{getTensor(output_index)}; - std::vector<std::shared_ptr<ITensor>> input_tensors{getTensor(input_index)}; - std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info; - outputs_dyn_alloc_info[output_tensors.at(0)] = exec::DynAllocInfo{output_index}; - - auto fn = - std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, outputs_dyn_alloc_info); + std::vector<ITensor *> output_tensors{getTensor(output_index)}; + std::vector<ITensor *> input_tensors{getTensor(input_index)}; + auto fn = std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors); _return_fn = std::move(fn); } @@ -131,7 +122,7 @@ void KernelGenerator::visit(const ir::operation::While &node) // This op does not support input as a constant, because controlflow backend does not have // TensorBuilder - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; + std::vector<backend::ITensor *> input_tensors; for (const auto input_index : node.getInputs()) { auto input_tensor = getTensor(input_index); @@ -139,29 +130,25 @@ void KernelGenerator::visit(const ir::operation::While &node) input_tensors.emplace_back(input_tensor); } - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; - std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info; + std::vector<backend::ITensor *> output_tensors; for (const auto output_index : node.getOutputs()) { auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_tensor); - - outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index}; } // WhileLayer just set ExecutorMap instead of cond and body executor to avoid complexity of // creating executor recusively auto fn = std::make_unique<::onert::backend::controlflow::kernel::WhileLayer>( - input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info, - cond_subg_index, body_subg_index, _executor_map); + input_tensors, output_tensors, node.getOutputs(), _graph, cond_subg_index, body_subg_index, + _executor_map); _return_fn = std::move(fn); } -std::shared_ptr<backend::ITensor> KernelGenerator::getTensor(const ir::OperandIndex &index) +backend::ITensor *KernelGenerator::getTensor(const ir::OperandIndex &index) { - std::shared_ptr<backend::ITensor> ret = _tensor_registries.getITensor(index); + backend::ITensor *ret = _tensor_registries.getITensor(index); assert(ret != nullptr); return ret; } diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h index b84a810e4..ce248913f 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h @@ -56,7 +56,7 @@ public: void visit(const ir::operation::While &) override; private: - std::shared_ptr<backend::ITensor> getTensor(const ir::OperandIndex &index); + backend::ITensor *getTensor(const ir::OperandIndex &index); private: const ir::Graph &_graph; diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc index e5c3f5fd5..7d0ff201f 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc @@ -29,8 +29,8 @@ namespace controlflow TensorBuilder::TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg) : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new 
DynamicTensorManager(_tensor_reg)}, - _static_tensor_mgr{ - new cpu_common::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())} + _static_tensor_mgr{new cpu_common::StaticTensorManager( + _tensor_reg->base_reg(), _dynamic_tensor_mgr->dynamic_mem_mgr().get())} { /* empty */ } @@ -101,25 +101,14 @@ void TensorBuilder::allocate() // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. } -std::shared_ptr<cpu_common::Tensor> TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind) +IDynamicTensorManager *TensorBuilder::dynamicTensorManager(void) { - return _tensor_reg->getNativeOwnTensor(ind); -} - -std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void) -{ - return std::move(_static_tensor_mgr); + return _dynamic_tensor_mgr.get(); } -std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void) +cpu_common::Tensor *TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind) { - return std::move(_dynamic_tensor_mgr); -} - -void TensorBuilder::setNativeUserTensor(const ir::OperandIndex &ind, - const std::shared_ptr<UserTensor> &tensor) -{ - _tensor_reg->setNativeUserTensor(ind, tensor); + return _tensor_reg->getNativeOwnTensor(ind); } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h index 2f2a2c47e..695994761 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h @@ -27,7 +27,6 @@ #include <unordered_map> #include "DynamicTensorManager.h" -#include "UserTensorRegistry.h" namespace onert { @@ -59,20 +58,15 @@ public: void allocate() override; void postFunctionPrepare() override { /* DO NOTHING */} - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - - IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); } - - std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override; + IDynamicTensorManager *dynamicTensorManager(void) override; /** * @brief Get tensor with a specific OperandIndex. * @param ind OperandIndex for the tensor. There must exist a tensor with this ind. * If not, program will crash with assert or exception. 
- * @return shared_ptr<operand::Tensor> + * @return operand::Tensor * */ - std::shared_ptr<cpu_common::Tensor> nativeOwnTensorAt(const ir::OperandIndex &ind); - void setNativeUserTensor(const ir::OperandIndex &ind, const std::shared_ptr<UserTensor> &tensor); + cpu_common::Tensor *nativeOwnTensorAt(const ir::OperandIndex &ind); private: const std::shared_ptr<TensorRegistry> _tensor_reg; diff --git a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h index 678c5b73b..94f71bb9c 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h +++ b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h @@ -48,7 +48,7 @@ class TensorRegistry : public ITensorRegistry public: TensorRegistry() : _base_reg{new cpu_common::TensorRegistry} {} - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override + ITensor *getITensor(const ir::OperandIndex &ind) override { auto base_tensor = _base_reg->getITensor(ind); if (base_tensor) @@ -56,7 +56,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { auto base_tensor = _base_reg->getNativeITensor(ind); if (base_tensor) @@ -64,7 +64,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind) + IPortableTensor *getPortableTensor(const ir::OperandIndex &ind) { auto base_tensor = _base_reg->getPortableTensor(ind); if (base_tensor) @@ -72,7 +72,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<IPortableTensor> getNativeTensor(const ir::OperandIndex &ind) + IPortableTensor *getNativeTensor(const ir::OperandIndex &ind) { auto base_tensor = _base_reg->getNativeTensor(ind); if (base_tensor) @@ -80,21 +80,20 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<Tensor> getNativeOwnTensor(const ir::OperandIndex &ind) + Tensor *getNativeOwnTensor(const ir::OperandIndex &ind) { return _base_reg->getNativeTensor(ind); } - std::shared_ptr<UserTensor> getNativeUserTensor(const ir::OperandIndex &ind) + UserTensor *getNativeUserTensor(const ir::OperandIndex &ind) { auto tensor = _native_user_tensors.find(ind); if (tensor != _native_user_tensors.end()) - return tensor->second; + return tensor->second.get(); return nullptr; } - bool setMigrantTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) override + bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet @@ -102,21 +101,21 @@ public: return true; } - void setNativeOwnTensor(ir::OperandIndex ind, const std::shared_ptr<Tensor> &tensor) + void setNativeOwnTensor(ir::OperandIndex ind, std::unique_ptr<Tensor> &&tensor) { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet - _base_reg->setNativeTensor(ind, tensor); + _base_reg->setNativeTensor(ind, std::move(tensor)); } - void setNativeUserTensor(ir::OperandIndex ind, const std::shared_ptr<UserTensor> &tensor) + void setNativeUserTensor(ir::OperandIndex ind, std::unique_ptr<UserTensor> &&tensor) { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet - _native_user_tensors[ind] = tensor; + _native_user_tensors[ind] = std::move(tensor); } - const ir::OperandIndexMap<std::shared_ptr<UserTensor>> &native_user_tensors() + const 
ir::OperandIndexMap<std::unique_ptr<UserTensor>> &native_user_tensors() { return _native_user_tensors; } @@ -124,7 +123,7 @@ public: private: std::shared_ptr<cpu_common::TensorRegistry> _base_reg; - ir::OperandIndexMap<std::shared_ptr<UserTensor>> _native_user_tensors; + ir::OperandIndexMap<std::unique_ptr<UserTensor>> _native_user_tensors; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.cc b/runtime/onert/core/src/backend/controlflow/UserTensor.cc index c8e2ebade..5081a90ea 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.cc +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.cc @@ -16,6 +16,9 @@ #include "UserTensor.h" +#include "util/Exceptions.h" +#include "ir/DataType.h" + namespace onert { namespace backend @@ -35,6 +38,16 @@ size_t UserTensor::calcOffset(const ir::Coordinates &coords) const return offset; } +bool UserTensor::applyShape(const ir::Shape &new_shape) +{ + // User tensors cannot be reallocated. + auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type()); + if (total_size() < new_size) + throw InsufficientBufferSizeException{"User given buffer size is too small."}; + setShape(new_shape); + return true; +} + } // namespace controlflow } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h index 9be33595d..7aa62a8a9 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.h +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h @@ -38,16 +38,12 @@ namespace controlflow class UserTensor : public IPortableTensor { public: - UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size, - IDynamicTensorManager *dynamic_tensor_manager) - : _info{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}, - _dynamic_tensor_manager{dynamic_tensor_manager} + UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size) + : IPortableTensor{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false} { } - UserTensor(const ir::OperandInfo &info, ir::Layout layout, - IDynamicTensorManager *dynamic_tensor_manager) - : UserTensor{info, layout, nullptr, 0, dynamic_tensor_manager} + UserTensor(const ir::OperandInfo &info, ir::Layout layout) : UserTensor{info, layout, nullptr, 0} { } @@ -73,15 +69,13 @@ public: ir::Shape getShape() const override { return _info.shape(); } void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } bool is_constant() const override { return false; } - IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; } + bool applyShape(const ir::Shape &) override; private: - ir::OperandInfo _info; ir::Layout _layout; uint8_t *_buffer; size_t _size; bool _dynamic; - IDynamicTensorManager *_dynamic_tensor_manager; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc index 8377c7183..c0329acd8 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc @@ -30,16 +30,13 @@ namespace controlflow namespace kernel { -IfLayer::IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor, - const std::vector<std::shared_ptr<backend::ITensor>> input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> output_tensors, 
+IfLayer::IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map) : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, - _output_indices{output_indices}, _graph{graph}, - _outputs_dyn_alloc_info{outputs_dyn_alloc_info}, _then_subg_index{then_subg_index}, + _output_indices{output_indices}, _graph{graph}, _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index}, _executor_map{executor_map} { // At this point, executor_map may not have executors of then subg and else subg @@ -63,21 +60,24 @@ void IfLayer::run() }; exec::ExecutorBase *subg_exec = nullptr; - if (getResultCond(_cond_tensor.get())) + bool cond_result = getResultCond(_cond_tensor); + if (cond_result) { + VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl; subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>( _executor_map->at(_then_subg_index).get()); } else { + VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl; subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>( _executor_map->at(_else_subg_index).get()); } const auto &subg_graph = subg_exec->graph(); - std::vector<std::shared_ptr<backend::ITensor>> src_tensors; - std::vector<std::shared_ptr<backend::ITensor>> dst_tensors; + std::vector<backend::ITensor *> src_tensors; + std::vector<backend::ITensor *> dst_tensors; // Add tensors used in subgraph or contained in outputs of subgraph assert(subg_graph.getInputs().size() == _input_tensors.size()); assert(subg_graph.getInputs().size() == subg_exec->getInputTensors().size()); @@ -91,9 +91,8 @@ void IfLayer::run() dst_tensors.emplace_back(subg_exec->getInputTensors().at(i)); } } - const auto &subg_inputs_dyn_alloc_info = subg_exec->getInputsDynamicAllocInfo(); const auto permute_op_input_to_subg_input = - std::make_shared<PermuteLayer>(src_tensors, dst_tensors, subg_inputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(src_tensors, dst_tensors); // Add tensors used as output of operation or contained in outputs of operation src_tensors.clear(); @@ -111,7 +110,7 @@ void IfLayer::run() } } const auto permute_subg_output_to_op_output = - std::make_shared<PermuteLayer>(src_tensors, dst_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(src_tensors, dst_tensors); // Remove copying of unused tensor permute_op_input_to_subg_input->prepare(); @@ -120,6 +119,8 @@ void IfLayer::run() // Copy & run subg_exec->execute(_input_tensors, permute_op_input_to_subg_input); permute_subg_output_to_op_output->run(); + VERBOSE(If) << "Return from $" << (cond_result ? 
_then_subg_index : _else_subg_index) + << std::endl; } } // namespace kernel diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h index ef3a6e6f6..1461388dc 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h @@ -32,11 +32,9 @@ namespace kernel class IfLayer : public ::onert::exec::IFunction { public: - IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor, - const std::vector<std::shared_ptr<backend::ITensor>> input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> output_tensors, + IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map); @@ -44,12 +42,11 @@ public: void run() override; private: - const std::shared_ptr<backend::ITensor> _cond_tensor; - const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; + backend::ITensor *_cond_tensor; + const std::vector<backend::ITensor *> _input_tensors; + const std::vector<backend::ITensor *> _output_tensors; const ir::OperandIndexSequence &_output_indices; const ir::Graph &_graph; - const exec::DynAllocInfoMap _outputs_dyn_alloc_info; const ir::SubgraphIndex _then_subg_index; const ir::SubgraphIndex _else_subg_index; exec::ExecutorMap *_executor_map; diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc index e8f1ea679..49fbb33c4 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc @@ -54,12 +54,9 @@ void PermuteLayer::run() try { - const auto dst_index = _dst_dyn_alloc_info_map.at(dst_tensor).ind; - auto dyn_tensor_manager = dst_tensor->dynamic_tensor_manager(); - if (!dyn_tensor_manager) + if (!dst_tensor->applyShape(new_shape)) throw std::runtime_error{ "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"}; - dyn_tensor_manager->applyShape(dst_index, new_shape); assert(dst_tensor->buffer() != nullptr); } catch (const std::out_of_range &e) diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h index 403ac770d..8129403a5 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h @@ -33,10 +33,7 @@ namespace kernel class PermuteLayer : public onert::exec::IPermuteFunction { public: - PermuteLayer(const std::vector<std::shared_ptr<ITensor>> &src_tensors, - const std::vector<std::shared_ptr<ITensor>> &dst_tensors, - const exec::DynAllocInfoMap &dst_dyn_alloc_info_map) - : _dst_dyn_alloc_info_map{dst_dyn_alloc_info_map} + PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors) { assert(src_tensors.size() == dst_tensors.size()); _src_tensors = src_tensors; @@ -64,9 +61,6 @@ public: } void run() override; - -private: - const exec::DynAllocInfoMap _dst_dyn_alloc_info_map; }; } // namespace kernel diff --git 
a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc index 50936e5f6..225f0dd7c 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc @@ -30,16 +30,14 @@ namespace controlflow namespace kernel { -WhileLayer::WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, +WhileLayer::WhileLayer(const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map) : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index}, _output_indices{output_indices}, _graph{graph}, _input_tensors{input_tensors}, - _output_tensors{output_tensors}, _outputs_dyn_alloc_info{outputs_dyn_alloc_info}, - _executor_map{executor_map} + _output_tensors{output_tensors}, _executor_map{executor_map} { // At this point, executor_map may not have executors of cond subg and body subg } @@ -62,15 +60,13 @@ void WhileLayer::run() _executor_map->at(_body_subg_index).get()); const auto &cond_graph = cond_exec->graph(); - const auto &cond_inputs_dyn_alloc = cond_exec->getInputsDynamicAllocInfo(); const auto &body_graph = body_exec->graph(); - const auto &body_inputs_dyn_alloc = body_exec->getInputsDynamicAllocInfo(); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> cond_input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> body_input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> body_output_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> cond_input_tensors; + std::vector<backend::ITensor *> body_input_tensors; + std::vector<backend::ITensor *> body_output_tensors; + std::vector<backend::ITensor *> output_tensors; // Add only used tensors in cond subgraph assert(cond_graph.getInputs().size() == _input_tensors.size()); @@ -85,7 +81,7 @@ void WhileLayer::run() } } const auto permute_op_input_to_cond_input = - std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors, cond_inputs_dyn_alloc); + std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors); // Add only used tensors among outputs of while operation assert(_output_indices.size() == _input_tensors.size()); @@ -103,7 +99,7 @@ void WhileLayer::run() } } const auto permute_op_input_to_op_output = - std::make_shared<PermuteLayer>(input_tensors, output_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(input_tensors, output_tensors); // Add all tensors with unused tensors in body subgraph because unused input tensors will be // copied output tensors in body subgraph @@ -111,7 +107,7 @@ void WhileLayer::run() input_tensors = _input_tensors; body_input_tensors = body_exec->getInputTensors(); const auto permute_op_input_to_body_input = - std::make_shared<PermuteLayer>(input_tensors, body_input_tensors, body_inputs_dyn_alloc); + std::make_shared<PermuteLayer>(input_tensors, body_input_tensors); // Add only used tensors in cond subgraph assert(cond_graph.getInputs().size() == 
body_exec->getOutputTensors().size()); @@ -127,8 +123,8 @@ void WhileLayer::run() cond_input_tensors.emplace_back(cond_exec->getInputTensors().at(i)); } } - const auto permute_body_output_to_cond_input = std::make_shared<PermuteLayer>( - body_output_tensors, cond_input_tensors, cond_inputs_dyn_alloc); + const auto permute_body_output_to_cond_input = + std::make_shared<PermuteLayer>(body_output_tensors, cond_input_tensors); // Add only used tensors in body subgraph assert(body_graph.getInputs().size() == body_exec->getOutputTensors().size()); @@ -146,8 +142,8 @@ void WhileLayer::run() body_input_tensors.emplace_back(body_exec->getInputTensors().at(i)); } } - const auto permute_body_output_to_body_input = std::make_shared<PermuteLayer>( - body_output_tensors, body_input_tensors, body_inputs_dyn_alloc); + const auto permute_body_output_to_body_input = + std::make_shared<PermuteLayer>(body_output_tensors, body_input_tensors); // Add only used tensors among outputs of while operation assert(_output_indices.size() == body_exec->getOutputTensors().size()); @@ -165,7 +161,7 @@ void WhileLayer::run() } } const auto permute_body_output_to_op_output = - std::make_shared<PermuteLayer>(body_output_tensors, output_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(body_output_tensors, output_tensors); // Remove copying of unused tensor permute_op_input_to_cond_input->prepare(); @@ -175,7 +171,9 @@ void WhileLayer::run() permute_body_output_to_body_input->prepare(); permute_body_output_to_op_output->prepare(); + VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; cond_exec->execute(_input_tensors, permute_op_input_to_cond_input); + VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; assert(cond_exec->getOutputTensors().size() == 1); auto &cond_output_tensor = cond_exec->getOutputTensors().at(0); @@ -186,21 +184,27 @@ void WhileLayer::run() }; const auto body_execute_with_op_inputs = [&]() { + VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; body_exec->execute(_input_tensors, permute_op_input_to_body_input); + VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; const auto body_execute_with_body_outputs = [&]() { + VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; body_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_body_input); + VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; std::function<void()> body_execute = body_execute_with_op_inputs; const auto cond_execute = [&]() { + VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; cond_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_cond_input); + VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; }; auto permute_to_outputs_fn = permute_op_input_to_op_output; // Loop while Cond subgraph's output is true - while (getResultCond(cond_output_tensor.get())) + while (getResultCond(cond_output_tensor)) { body_execute(); cond_execute(); diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h index ebca8acdc..9dae49281 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h @@ -35,10 +35,9 @@ namespace kernel class WhileLayer : public ::onert::exec::IFunction { public: - WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const 
std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, + WhileLayer(const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map); @@ -50,9 +49,8 @@ private: const ir::SubgraphIndex _body_subg_index; const ir::OperandIndexSequence &_output_indices; const ir::Graph &_graph; - const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; - const exec::DynAllocInfoMap _outputs_dyn_alloc_info; + const std::vector<backend::ITensor *> _input_tensors; + const std::vector<backend::ITensor *> _output_tensors; exec::ExecutorMap *_executor_map; }; diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc index f7ce3d011..740248ccd 100644 --- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc @@ -17,6 +17,7 @@ #include "backend/cpu_common/DynamicTensorManager.h" #include "util/logging.h" +#include "misc/polymorphic_downcast.h" namespace onert { @@ -31,71 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry> // DO NOTHING } -void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) -{ - VERBOSE_F() << ind << std::endl; - - auto tensor = _tensors->getNativeTensor(ind); - assert(tensor); - - bool previously_dynamic = tensor->is_dynamic(); - - auto allocTensorMem = [&](bool overwrite = false) { - auto capacity = tensor->total_size(); - auto alloc = _dynamic_mem_mgr->allocate(ind, capacity); - - if (overwrite) - tensor->overwriteBuffer(alloc); - else - tensor->setBuffer(alloc); - }; - - if (!previously_dynamic) - { - // TODO deallocate tensor->buffer() - // issue is that staticTensorManager might have allocate this memory - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else if (tensor->buffer() == nullptr) - { - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(); - } - // when buffer was already allocated and new_shape requires different size - else - { - auto previous_size = tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type()); - if (previous_size != new_size) - { - _dynamic_mem_mgr->deallocate(ind); - - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else - { // when buffer with same size was already allocated, shape could differ - tensor->setShape(new_shape); - } - } -} - void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { assert(_tensors->getNativeTensor(ind) == nullptr); - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, this); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get()); + _tensors->setNativeTensor(ind, std::move(tensor)); } -void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) { - 
_dealloc_tensor_map[op_ind].emplace(operand_ind); + _dealloc_tensor_map[op_ind].emplace(tensor); } void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) @@ -105,31 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) return; auto &input_set = find->second; - for (auto input_ind : input_set) + for (auto *tensor : input_set) { - auto *tensor = _tensors->getNativeTensor(input_ind).get(); if (!tensor->is_dynamic()) continue; - _dynamic_mem_mgr->deallocate(input_ind); - tensor->resetBuffer(); + _dynamic_mem_mgr->deallocate(tensor); - VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value() + auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor); + cpu_tensor->resetBuffer(); + + VERBOSE(DynamicTensorManager) << "Deallocating tensor " << (void *)cpu_tensor << " (input of op_ind: " << op_ind.value() << ")" << std::endl; } } -void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) +const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind) { - auto *tensor = _tensors->getNativeTensor(output_ind).get(); - if (!tensor->is_dynamic()) - return; - - _dynamic_mem_mgr->deallocate(output_ind); - tensor->resetBuffer(); - - VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value() - << " (output of a subgraph)" << std::endl; + auto ptr = _tensors->getITensor(ind); + assert(ptr); + return ptr; } } // namespace cpu_common diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc index 8cb9c22ca..9f179d9ee 100644 --- a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc @@ -20,6 +20,7 @@ #include "MemoryPlannerFactory.h" #include "util/ConfigSource.h" +#include "util/logging.h" namespace onert { @@ -70,20 +71,20 @@ uint8_t *MemoryManager::getBuffer(const ir::OperandIndex &ind) const return _mem_alloc->base() + mem_blk.offset; } -std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ir::OperandIndex &ind, +std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ITensor *tensor, uint32_t capacity) { - auto find = _mem_alloc_map.find(ind); + auto find = _mem_alloc_map.find(tensor); if (find != _mem_alloc_map.end()) throw std::runtime_error("Cannot allocate memory for a tensor. 
It was already allocated."); - _mem_alloc_map[ind] = std::make_shared<cpu_common::Allocator>(capacity); - return _mem_alloc_map[ind]; + _mem_alloc_map[tensor] = std::make_shared<cpu_common::Allocator>(capacity); + return _mem_alloc_map[tensor]; } -void DynamicMemoryManager::deallocate(const ir::OperandIndex &ind) +void DynamicMemoryManager::deallocate(const ITensor *tensor) { - auto find = _mem_alloc_map.find(ind); + auto find = _mem_alloc_map.find(tensor); if (find == _mem_alloc_map.end()) throw std::runtime_error("Cannot find Allocator for the requested index"); diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc index 440f70c93..cac43babe 100644 --- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc @@ -27,9 +27,9 @@ namespace cpu_common { StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> ®, - IDynamicTensorManager *dynamic_tensor_manager) + DynamicMemoryManager *dynamic_mem_mgr) : _const_mgr{new DynamicMemoryManager()}, _nonconst_mgr{new MemoryManager()}, _tensors{reg}, - _dynamic_tensor_manager{dynamic_tensor_manager} + _dynamic_mem_mgr{dynamic_mem_mgr} { // DO NOTHING } @@ -39,10 +39,10 @@ void StaticTensorManager::allocateConsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (_as_constants[ind]) { - auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); + auto mem_alloc = _const_mgr->allocate(_tensors->getITensor(ind), tensor->total_size()); tensor->setBuffer(mem_alloc); auto buffer = mem_alloc->base(); VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() @@ -59,7 +59,7 @@ void StaticTensorManager::allocateNonconsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (!_as_constants[ind] && !tensor->is_dynamic()) { auto *buffer = _nonconst_mgr->getBuffer(ind); @@ -80,8 +80,8 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, bool as_const) { assert(!_tensors->getNativeTensor(ind)); - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr); + _tensors->setNativeTensor(ind, std::move(tensor)); _as_constants[ind] = as_const; } diff --git a/runtime/onert/core/src/backend/cpu_common/Tensor.cc b/runtime/onert/core/src/backend/cpu_common/Tensor.cc index f34564dd9..d3dcf9a6d 100644 --- a/runtime/onert/core/src/backend/cpu_common/Tensor.cc +++ b/runtime/onert/core/src/backend/cpu_common/Tensor.cc @@ -16,6 +16,9 @@ #include "backend/cpu_common/Tensor.h" +#include "ir/DataType.h" +#include "backend/cpu_common/MemoryManager.h" + namespace onert { namespace backend @@ -23,6 +26,8 @@ namespace backend namespace cpu_common { +Tensor::~Tensor() {} + size_t Tensor::calcOffset(const ir::Coordinates &coords) const { size_t rank = num_dimensions(); @@ -38,6 +43,55 @@ size_t Tensor::calcOffset(const ir::Coordinates &coords) const void Tensor::setShape(const ir::Shape &new_shape) { _info.shape(new_shape); } +bool Tensor::applyShape(const ir::Shape &new_shape) +{ + bool previously_dynamic = is_dynamic(); + + auto allocTensorMem = [&](bool overwrite = false) { + auto capacity = 
total_size(); + auto alloc = _dynamic_mem_mgr->allocate(this, capacity); + + if (overwrite) + overwriteBuffer(alloc); + else + setBuffer(alloc); + }; + + if (!previously_dynamic) + { + // TODO deallocate tensor->buffer() + // issue is that staticTensorManager might have allocate this memory + setShape(new_shape); + set_dynamic(); + allocTensorMem(true); + } + else if (buffer() == nullptr) + { + setShape(new_shape); + set_dynamic(); + allocTensorMem(); + } + // when buffer was already allocated and new_shape requires different size + else + { + auto previous_size = total_size(); + auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type()); + if (previous_size != new_size) + { + _dynamic_mem_mgr->deallocate(this); + + setShape(new_shape); + set_dynamic(); + allocTensorMem(true); + } + else + { // when buffer with same size was already allocated, shape could differ + setShape(new_shape); + } + } + return true; +} + } // namespace cpu_common } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/compiler/BackendManager.cc b/runtime/onert/core/src/compiler/BackendManager.cc index db7a14a96..0093f50fd 100644 --- a/runtime/onert/core/src/compiler/BackendManager.cc +++ b/runtime/onert/core/src/compiler/BackendManager.cc @@ -70,31 +70,18 @@ void BackendManager::loadBackend(const std::string &backend) } // TODO Remove indentation - // Workaround If backend have dynamic library with "-boost" suffix naming, - // BackendManager load library with "-boost" suffix instead of library without suffix - // This feature is used for custom backend extension to support additional operations { - const std::string backend_boost_so = "libbackend_" + backend + "-boost" + SHARED_LIB_EXT; const std::string backend_so = "libbackend_" + backend + SHARED_LIB_EXT; + void *handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); - void *handle = dlopen(backend_boost_so.c_str(), RTLD_LAZY | RTLD_LOCAL); if (handle == nullptr) { - handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); - - if (handle == nullptr) - { - VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl; - return; - } - - VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n"; - } - else - { - VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_boost_so << "\n"; + VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl; + return; } + VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n"; + { // load object creator function auto backend_create = (backend_create_t)dlsym(handle, "onert_backend_create"); diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc index 93dbbc3b5..12b582b35 100644 --- a/runtime/onert/core/src/compiler/Compiler.cc +++ b/runtime/onert/core/src/compiler/Compiler.cc @@ -19,6 +19,7 @@ #include "ParamChecker.h" #include "ExecutorFactory.h" #include "OperationValidator.h" +#include "ShapeValidator.h" #include "Fp32ToFp16Converter.h" #include <backend/controlflow/Config.h> @@ -27,8 +28,12 @@ #include "compiler/ManualScheduler.h" #include "compiler/HEScheduler.h" #include "compiler/StaticShapeInference.h" +#include "compiler/pass/ConstantOutputPass.h" +#include "compiler/pass/OddOutputPass.h" +#include "compiler/pass/PassRunner.h" #include "exec/ExecTime.h" #include "ir/operation/LowerInfo.h" +#include "ir/verifier/Verifier.h" #include "dumper/dot/DotDumper.h" #include "compiler/Linear.h" #include 
"interp/InterpExecutor.h" @@ -132,6 +137,8 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) backend::controlflow::Config::ID; _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = backend::controlflow::Config::ID; + _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = + backend::controlflow::Config::ID; } // FIXME This is a workaround for bcq operations, should remove it @@ -159,10 +166,24 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) VERBOSE(Compiler) << std::noboolalpha; } + _subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::ConstantOutputPass>(subg)) + .append(std::make_unique<pass::OddOutputPass>(subg)) + .run(); + }); + /*************************************************** * Prepare compilation phase ***************************************************/ + // Check shape independent operation feature + // - Operand type + // - Shape independent parameter + _subgraphs->iterate( + [](const onert::ir::SubgraphIndex &, const ir::Graph &subg) { OperationValidator{subg}(); }); + auto executors = std::make_shared<exec::ExecutorMap>(); // Compilable check @@ -229,17 +250,23 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) inferer.dump(); } - /************************************************************* - * Backend independent analysis & optimization phase finished - *************************************************************/ - - // operation validation + // Shape validation + // TODO Move shape independent feature check from ShapeValidator to OperationValidator + // TODO Move ShapeValidator into shape inference + // - Check input tensor shape validation + // - Check parameter value validation which valid value is depend on input tensor shape + // - Output tensor shape validation check is needless because + // static/dynamic shape inferer will make valid output shape for (auto &pair : lowered_subgs) { auto &lowered_subg = pair.second; - compiler::OperationValidator{lowered_subg->graph()}(); + compiler::ShapeValidator{lowered_subg->graph()}(); } + /************************************************************* + * Backend independent analysis & optimization phase finished + *************************************************************/ + executors = std::make_shared<exec::ExecutorMap>(); for (auto &pair : lowered_subgs) { diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index 062c6c9c3..bb325ffbc 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -29,6 +29,7 @@ #include "backend/IConstantInitializer.h" #include "backend/IKernelGenerator.h" #include "backend/IOptimizer.h" +#include "backend/IPortableTensor.h" #include "backend/ITensorRegister.h" #include "backend/controlflow/Config.h" #include "backend/controlflow/KernelGenerator.h" @@ -65,23 +66,6 @@ private: std::shared_ptr<backend::IConfig> _config; }; -// TODO Think of a better way to manage TensorManagers -backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders) -{ - backend::TensorManagerSet tensor_mgrs; - for (auto &tensor_builder : tensor_builders) - { - auto s_tensor_manager = tensor_builder->releaseStaticTensorManager(); - if (s_tensor_manager != nullptr) - tensor_mgrs.insert(std::move(s_tensor_manager)); - - auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager(); - 
if (d_tensor_manager != nullptr) - tensor_mgrs.insert(std::move(d_tensor_manager)); - } - return tensor_mgrs; -} - } // namespace } // namespace onert @@ -172,7 +156,8 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap for (const auto op_idx : op_seq) { const auto &op = lowered_graph->graph().operations().at(op_idx); - for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs()) + for (const auto &index : + (op.getInputs() | ir::Remove::UNDEFINED) + (op.getOutputs() | ir::Remove::UNDEFINED)) { if (!tensor_builder->isRegistered(index) && !model_io.contains(index)) { @@ -200,11 +185,11 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap } } -std::vector<std::shared_ptr<backend::ITensor>> +std::vector<backend::ITensor *> ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices) { - std::vector<std::shared_ptr<backend::ITensor>> ret; + std::vector<backend::ITensor *> ret; // TODO Store controlflow backend in BackendContext std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder; @@ -227,19 +212,20 @@ ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, for (auto ind : indices) { const auto &operand = lowered_graph.graph().operands().at(ind); - auto tensor = std::make_shared<backend::controlflow::UserTensor>( + auto tensor = std::make_unique<backend::controlflow::UserTensor>( operand.info(), - ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */ - cf_tensor_builder->dynamicTensorManager()); + ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */ + ); // Add tensor to controlflow TensorRegistry. - cf_tensor_reg->setNativeUserTensor(ind, tensor); - ret.push_back(tensor); + cf_tensor_reg->setNativeUserTensor(ind, std::move(tensor)); + auto *itensor = cf_tensor_reg->getITensor(ind); + ret.push_back(itensor); } return ret; } -void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph) +void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph) { TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true}; @@ -251,13 +237,13 @@ void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_gra ir::Remove::UNDEFINED) { // If an OpSequence input/output tensor does not have a own tensor object, - // it must be using external tensors, so find the tensor from other tensor builders and + // it must be using migrant tensors, so find the tensor from other tensor builders and // set the tensor to this tensor builder if portable if (!backend_ctx->tensor_registry->getITensor(ind)) { auto tensor = tensor_regs.getITensor(ind); assert(tensor); // The tensor must have been registered - auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor); + auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor); if (ptensor) backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor); } @@ -299,8 +285,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo auto order = Linear::linearize(*lowered_graph); runTensorRegistration(lowered_graph.get(), order); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> output_tensors; if (options.is_primary_subgraph) { input_tensors = 
initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); @@ -318,7 +304,7 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo tensor_builder->prepare(); } - prepareExternalTensors(*lowered_graph); + prepareMigrantTensors(*lowered_graph); ExecutionBuilder builder; @@ -370,10 +356,9 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo }); } - backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders); - auto exec = new exec::LinearExecutor{ - std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map), order}; + auto exec = + new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, + std::move(code_map), order}; if (!options.trace_filepath.empty()) { @@ -396,8 +381,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( auto order = Linear::linearize(*lowered_graph); runTensorRegistration(lowered_graph.get(), order); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> output_tensors; if (options.is_primary_subgraph) { input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); @@ -424,7 +409,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( tensor_builder->prepare(); } - prepareExternalTensors(*lowered_graph); + prepareMigrantTensors(*lowered_graph); ExecutionBuilder builder; @@ -477,20 +462,16 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( }); } - backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders); - exec::ExecutorBase *exec = nullptr; if (parallel) { - exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, - output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)}; + exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors, + tensor_regs, std::move(code_map)}; } else { - auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors, - output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)}; + auto dataflow_exec = new exec::DataflowExecutor{ + std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, std::move(code_map)}; if (options.he_profiling_mode) { std::vector<const backend::Backend *> backends; diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index b8893c03b..e76b721ea 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -46,10 +46,10 @@ private: static void initializeBackendContext(compiler::LoweredGraph *lowered_graph); static void runTensorRegistration(compiler::LoweredGraph *lowered_graph, const std::vector<ir::OpSequenceIndex> &order); - static std::vector<std::shared_ptr<backend::ITensor>> + static std::vector<backend::ITensor *> initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices); - static void prepareExternalTensors(compiler::LoweredGraph &lowered_graph); + static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph); static exec::IExecutor * createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options, diff --git 
a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc index 5653b090e..fe54b0fdd 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.cc @@ -34,7 +34,8 @@ namespace compiler static uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::Operation &node) { uint32_t size = 0; - for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs()) + for (const auto &ind : + (node.getInputs() | ir::Remove::UNDEFINED) + (node.getOutputs() | ir::Remove::UNDEFINED)) { size += graph.operands().at(ind).info().total_size(); } @@ -248,8 +249,9 @@ int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend, if (time != _exec_time->NOT_FOUND) return time; + // FIXME permute time is not recorded so the control reaches here always // Makes the scheduler prefer keeping computations on one backend - return size / 200; + return size / 400; } int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backend *backend) @@ -370,7 +372,7 @@ int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index) { const auto &node = _graph->operations().at(index); int64_t max_child_rank = 0; - for (const auto &output : node.getOutputs()) + for (const auto &output : node.getOutputs() | ir::Remove::UNDEFINED) { const auto &operand = _graph->operands().at(output); const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM; diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc index 49a989500..39e58fe11 100644 --- a/runtime/onert/core/src/compiler/Linear.cc +++ b/runtime/onert/core/src/compiler/Linear.cc @@ -148,6 +148,9 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph, tensor_builder->notifyFirstUse(ind); } + const auto io_tensors = + (graph.getInputs() + graph.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + // At each operation, // 1. Scan DEF of outputs. If the DEF, allocate it // 2. Scan USE of inputs. 
Decrease the USE and deallocate if the USE is 0 @@ -182,7 +185,15 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph, // plan for deallocation of dynamic tensor auto dyn_tensor_manager = tensor_builder_map[ind]->dynamicTensorManager(); if (dyn_tensor_manager) - dyn_tensor_manager->planDealloc(op_idx, ind); + { + const auto *backend = + lowered_graph.getLowerInfo(ind)->def_factors().getOnlyElement().backend(); + auto &tensor_registry = lowered_graph.backend_contexts().at(backend)->tensor_registry; + auto *tensor = tensor_registry->getITensor(ind); + assert(tensor); + if (!io_tensors.contains(ind)) // I/O tensors cannot be deallocated + dyn_tensor_manager->planDealloc(op_idx, tensor); + } } } } diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc index 1489a1884..cdf1a8158 100644 --- a/runtime/onert/core/src/compiler/LoweredGraph.cc +++ b/runtime/onert/core/src/compiler/LoweredGraph.cc @@ -21,6 +21,7 @@ #include "util/logging.h" #include "compiler/pass/ConstantInsertionPass.h" #include "compiler/pass/ConstantLoweringPass.h" +#include "compiler/pass/PassRunner.h" #include "compiler/pass/PermutationOperationPass.h" #include "compiler/pass/PermutationInsertionPass.h" #include "compiler/pass/PermutationEliminationPass.h" @@ -101,14 +102,14 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option std::reverse(std::begin(op_seq.operations()), std::end(op_seq.operations())); }); - VERBOSE(OpSequences) << "dump without permutation" << std::endl; + VERBOSE(OpSequences) << "dump before permutation insertion" << std::endl; dumpOpSequences(_op_seqs, _graph.operations()); - pass::ConstantInsertionPass ci_pass(*this); - ci_pass.run(); - - pass::ConstantLoweringPass cl_pass(*this); - cl_pass.run(); + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::ConstantInsertionPass>(*this)) + .append(std::make_unique<pass::ConstantLoweringPass>(*this)) + .run(); // Set LowerInfo for each operand from the operand::LowerInfo holder manipulateLowerInfo(operands_lower_info, options.is_primary_subgraph); @@ -116,20 +117,17 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option dumpLowerInfo(); } - // Run Permutation Passes - { - pass::PermutationOperationPass po_pass(*this); - po_pass.run(); - - pass::PermutationInsertionPass pi_pass(*this); - pi_pass.run(); + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::PermutationOperationPass>(*this)) + .append(std::make_unique<pass::PermutationInsertionPass>(*this)) + .run(); - pass::PermutationEliminationPass pe_pass(*this); - pe_pass.run(); + // Optimization passes + pass::PassRunner{}.append(std::make_unique<pass::PermutationEliminationPass>(*this)).run(); - VERBOSE(OpSequences) << "dump with permutation" << std::endl; - dumpOpSequences(_op_seqs, _graph.operations()); - } + VERBOSE(OpSequences) << "Dump after permutation insertion" << std::endl; + dumpOpSequences(_op_seqs, _graph.operations()); // Graph verifications { @@ -276,7 +274,7 @@ void LoweredGraph::makeOpSequences( auto &&lower_info = operands_lower_info.at(operand); lower_info->addUsePermuteFactor(ir::operand::PermuteFactor{backend, backend_layout}); } - for (auto operand : node.getOutputs()) + for (auto operand : node.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(operand); lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{backend, backend_layout}); @@ -340,7 +338,7 @@ void 
LoweredGraph::manipulateLowerInfo( assert(lower_info->def_factors().empty()); lower_info->addDefPermuteFactor(factor); } - for (auto index : _graph.getOutputs()) + for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(index); lower_info->addUsePermuteFactor(factor); @@ -368,7 +366,7 @@ void LoweredGraph::manipulateLowerInfo( } } } - for (auto index : _graph.getOutputs()) + for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(index); if (lower_info->def_factors().size() == 0) @@ -496,7 +494,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index, branched_set.clear(); // Check for branching down - for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED) + for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { // TODO Fix this workaround for the case of model outputs that are used by another operation // This is needed since the branching is decided by operation, but for model outputs, @@ -544,7 +542,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index, } // node's input == op_seq's output? - for (const auto output : n.getOutputs()) + for (const auto output : n.getOutputs() | ir::Remove::UNDEFINED) { if (node_inputs.contains(output)) { diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc index f7f659e3e..0582cf154 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.cc +++ b/runtime/onert/core/src/compiler/OperationValidator.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
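// The OperationValidator hunks that follow strip out every check that depends on concrete tensor
// shapes; those checks are presumably taken over by the new compiler::ShapeValidator referenced in
// the Compiler.cc hunk above, while OperationValidator keeps only shape-independent checks
// (operand data types, constant-ness, scalar parameters). A minimal sketch of that split, reusing
// the real OP_REQUIRES macro defined just below; the helper names and the NHWC assumption are
// illustrative only and not part of this patch:
//
//   // Shape-independent check: stays in OperationValidator
//   void validateDepthToSpaceParams(int32_t block_size)
//   {
//     OP_REQUIRES(block_size > 0);
//   }
//
//   // Shape-dependent check: belongs to ShapeValidator, mirroring the lines removed further down
//   void validateDepthToSpaceShapes(const ir::Shape &in, const ir::Shape &out, int32_t block_size)
//   {
//     OP_REQUIRES(in.rank() == 4 && out.rank() == 4);            // assumes NHWC feature maps
//     OP_REQUIRES(in.dim(3) % (block_size * block_size) == 0);   // channels divisible by block^2
//     OP_REQUIRES(out.dim(3) == in.dim(3) / (block_size * block_size));
//   }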
@@ -16,13 +16,7 @@ #include "OperationValidator.h" -#include <typeinfo> - #include "ir/Graph.h" -#include "ir/operation/LowerInfo.h" - -#include "util/logging.h" -#include "util/Utils.h" #define OP_REQUIRES(EXP) \ do \ @@ -37,33 +31,14 @@ namespace compiler { OperationValidator::OperationValidator(const ir::Graph &graph) - : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN} + : _graph{graph}, _ctx{graph.operands()} { } -void OperationValidator::checkUnaryOp(const ir::Operation &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(0)}; - - // Check if I/O types match - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - - // Check if I/O shapes match - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - void OperationValidator::operator()() { - // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when - // creating Compiler assert(_graph.subgraphs() == nullptr); - _current_op_seq_layout = _graph.layout(); - _graph.operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); } @@ -72,50 +47,23 @@ void OperationValidator::visit(const ir::operation::BatchMatMul &node) { const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS)); const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS)); - const auto out_index{node.getOutputs().at(0)}; // Constant lhs and rhs is not implemented yet OP_REQUIRES(!_ctx.at(lhs_index).isConstant() && !_ctx.at(rhs_index).isConstant()); - - if (_ctx.at(out_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4); - OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4); - OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2); - OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2); } void OperationValidator::visit(const ir::operation::BatchToSpaceND &node) { - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - - // All requirement as per NNAPI specification. - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); - - OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); - + // Non-constant block_size is not implemented yet OP_REQUIRES(_ctx.at(block_size_index).isConstant()); - - OP_REQUIRES(input_shape.C == output_shape.C); } void OperationValidator::visit(const ir::operation::Comparison &node) { const auto output_index{node.getOutputs().at(0)}; - // This validator does not check shape. So checking isDynamic() is skipped. 
const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; @@ -124,223 +72,20 @@ void OperationValidator::visit(const ir::operation::Comparison &node) OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::BOOL8); } -void OperationValidator::visit(const ir::operation::Softmax &node) -{ - VERBOSE(Softmax) << "Configure SOFTMAX operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::InstanceNorm &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; - const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; - const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape()); - OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1); -} - -void OperationValidator::visit(const ir::operation::Pool2D &node) +void OperationValidator::visit(const ir::operation::DepthToSpace &node) { - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; + int32_t block_size = node.param().block_size; - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(block_size > 0); } -void OperationValidator::visit(const ir::operation::Permute &node) +void OperationValidator::visit(const ir::operation::ElementwiseActivation &node) { - VERBOSE(Permute) << "Configure Permute operation" << std::endl; - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::Reduce &node) -{ - VERBOSE(Permute) << "Configure " + node.name() + " operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; - const auto input_shape = _ctx.at(input_index).shape(); - const auto output_shape = _ctx.at(output_index).shape(); - - OP_REQUIRES(input_shape.rank() <= 4); - OP_REQUIRES(output_shape.rank() <= input_shape.rank()); - - // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only - // supports cases reducing height and width or reducing depth. - // TODO We have to support all cases of dimensions up to 4. - // For correct permuting, we have to set output's shape to be equal in dimension position of the - // input. But the positions of the same dimensions in the input and output may be set differently. - // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original - // output shape should be {1,3,1,5}, but real output shape may be {3,5}. 
If you simply try to - // extend it in 4 dimensions, it should be {1,1,3,5}. - // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of - // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the - // next operation is not desired. - if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank()) - { - if (output_shape.rank() == 2) - { - // Reducing HW - OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) && - input_shape.dim(3) == output_shape.dim(1)); - } - else if (output_shape.rank() == 3) - { - // Reducing C or - // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1) - OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) && - input_shape.dim(1) == output_shape.dim(1) && - input_shape.dim(2) == output_shape.dim(2)) || - (input_shape.dim(0) == output_shape.dim(0) && - (input_shape.dim(1) == output_shape.dim(1) || - input_shape.dim(2) == output_shape.dim(1)) && - input_shape.dim(3) == 1 && output_shape.dim(2) == 1)); - } - } -} - -void OperationValidator::visit(const ir::operation::Transpose &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; - - const auto &output_shape = _ctx.at(output_index).shape(); - const auto &input_shape = _ctx.at(input_index).shape(); - - OP_REQUIRES(input_shape.rank() == static_cast<int>(perm.size())); - OP_REQUIRES(input_shape.rank() == output_shape.rank()); -} - -void OperationValidator::visit(const ir::operation::RNN &node) -{ - // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn - // TODO Support dynamic rnn - const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto hidden_state_out_index{ - node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; - - const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; - const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; - const auto recurrent_weights_index{ - node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; - const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; - const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; - - const auto batch_size = _ctx.at(output_index).shape().dim(0); - const auto num_units = _ctx.at(output_index).shape().dim(1); - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 && - _ctx.at(hidden_state_out_index).shape().rank() == 2 && - _ctx.at(input_index).shape().rank() == 2 && - _ctx.at(weights_index).shape().rank() == 2 && - _ctx.at(recurrent_weights_index).shape().rank() == 2 && - _ctx.at(hidden_state_in_index).shape().rank() == 2); - OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1); - - OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) && - batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) && - batch_size == _ctx.at(hidden_state_out_index).shape().dim(0)); - OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1)); - - OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_weights_index).shape().dim(0) && - num_units == _ctx.at(bias_index).shape().dim(0)); - OP_REQUIRES(num_units == 
_ctx.at(output_index).shape().dim(1) && - num_units == _ctx.at(recurrent_weights_index).shape().dim(1) && - num_units == _ctx.at(hidden_state_in_index).shape().dim(1) && - num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); -} - -void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; - const auto block_size_index{ - node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; - const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - - // All requirement as per NNAPI specification. - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2); - - OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); - OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2); - OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2); - - OP_REQUIRES(_ctx.at(block_size_index).isConstant()); - OP_REQUIRES(_ctx.at(paddings_index).isConstant()); - - OP_REQUIRES(input_shape.C == output_shape.C); -} - -void OperationValidator::visit(const ir::operation::SpaceToDepth &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - const auto block_size = node.param().block_size; - - // All assertions as per NNAPI specification. 
- OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES((block_size >= 1) && (input_shape.H % block_size == 0) && - (input_shape.W % block_size == 0)); - OP_REQUIRES(input_shape.N == output_shape.N); - OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C); -} - -void OperationValidator::visit(const ir::operation::ElementwiseActivation &node) -{ - checkUnaryOp(node); + // Check if I/O types match + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } void OperationValidator::visit(const ir::operation::ElementwiseBinary &node) @@ -358,9 +103,6 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - OP_REQUIRES(node.getInputs().size() == 1); - OP_REQUIRES(node.getOutputs().size() == 1); - // Check if I/O types match if (node.param().op_type == ir::operation::ElementwiseUnary::Type::DEQUANTIZE) { @@ -376,47 +118,13 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node) { OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } - - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); } void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) { - const auto output_index{node.getOutputs().at(0)}; const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; - const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - - const auto &output_obj = _ctx.at(output_index); - const auto &lookups_obj = _ctx.at(lookups_index); - const auto &values_obj = _ctx.at(values_index); - - // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying - // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729) - { - OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32); - - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto &output_shape = output_obj.shape(); - const auto &lookups_shape = lookups_obj.shape(); - const auto &values_shape = values_obj.shape(); - - OP_REQUIRES(lookups_shape.rank() == 1); - OP_REQUIRES(values_shape.rank() >= 2); - - // output should be a n-D tensor with the same rank and shape as the values tensor, except for - // the first dimension which has the same size as lookups' only dimension. 
- OP_REQUIRES(output_shape.rank() == values_shape.rank()); - OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0)); - for (int n = 1; n < output_shape.rank(); ++n) - { - OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n)); - } - } + OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32); } void OperationValidator::visit(const ir::operation::ExpandDims &node) @@ -427,488 +135,35 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32); - - if (_ctx.at(axis_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); } void OperationValidator::visit(const ir::operation::HashtableLookup &node) { - const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)}; - const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; - const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - - const auto &output_obj = _ctx.at(output_index); - const auto &hits_obj = _ctx.at(hits_index); - - const auto &lookups_obj = _ctx.at(lookups_index); - const auto &keys_obj = _ctx.at(keys_index); - const auto &values_obj = _ctx.at(values_index); - - OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32); - OP_REQUIRES(keys_obj.typeInfo().type() == ir::DataType::INT32); - OP_REQUIRES(hits_obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); - - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto &output_shape = output_obj.shape(); - const auto &lookups_shape = lookups_obj.shape(); - const auto &keys_shape = keys_obj.shape(); - const auto &values_shape = values_obj.shape(); - - OP_REQUIRES(values_shape.rank() == output_shape.rank()); - OP_REQUIRES(lookups_shape.rank() == 1); - OP_REQUIRES(keys_shape.rank() == 1); - OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0)); - OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0)); -} - -void OperationValidator::visit(const ir::operation::TransposeConv &node) -{ - // param check - OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) || - (node.param().padding.type == ir::PaddingType::VALID)); - - // shape check - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; - const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; - - // Only 4D tensors are supported - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank()); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank()); - - const auto frontend_layout = _current_op_seq_layout; - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - // The kernel has only IHWO layout on frontend - // So ker_shape is treated here below - // I -> N - // H -> H - // W -> W - // O -> C - const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC); - - OP_REQUIRES(ifm_shape.N == 
ofm_shape.N); - OP_REQUIRES(ifm_shape.C == ker_shape.C); - OP_REQUIRES(ker_shape.N == ofm_shape.C); -} - -void OperationValidator::visit(const ir::operation::Gather &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; - const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - - const auto ifm_shape = _ctx.at(ifm_index).shape(); - const auto indices_shape = _ctx.at(indices_index).shape(); - const auto ofm_shape = _ctx.at(ofm_index).shape(); - - OP_REQUIRES(ifm_shape.rank() <= 4); - OP_REQUIRES(indices_shape.rank() <= 3); - OP_REQUIRES(ofm_shape.rank() <= 4); -} - -void OperationValidator::visit(const ir::operation::DepthToSpace &node) -{ - // param check - int32_t block_size = node.param().block_size; - - OP_REQUIRES(block_size > 0); - - // shape check - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout); - const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout); - - OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); - - { - OP_REQUIRES(output_shape.N == input_shape.N); - OP_REQUIRES(output_shape.H == input_shape.H * block_size); - OP_REQUIRES(output_shape.W == input_shape.W * block_size); - OP_REQUIRES(input_shape.C % (block_size * block_size) == 0); - OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size)); - } + OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32); + OP_REQUIRES(_ctx.at(keys_index).typeInfo().type() == ir::DataType::INT32); + OP_REQUIRES(_ctx.at(hits_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); } void OperationValidator::visit(const ir::operation::Pack &node) { - // param check const auto num{node.param().num}; - const auto axis{node.param().axis}; - OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size())); - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - // shape check - const auto &output_shape = _ctx.at(output_index).shape(); - const auto output_rank = static_cast<int32_t>(output_shape.rank()); - const auto input1_index{node.getInputs().at(0)}; - const auto input_shape = _ctx.at(input1_index).shape(); - - OP_REQUIRES(axis >= -output_rank && axis < output_rank); - for (const auto &index : node.getInputs()) - { - OP_REQUIRES(input_shape == _ctx.at(index).shape()); - } -} - -void OperationValidator::visit(const ir::operation::LSTM &node) -{ - // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn - // TODO Support dynamic rnn - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - 
const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2 && - _ctx.at(output_state_out_index).shape().rank() == 2 && - _ctx.at(cell_state_out_index).shape().rank() == 2 && - _ctx.at(output_index).shape().rank() == 2 && - _ctx.at(input_index).shape().rank() == 2 && - _ctx.at(input_to_input_weights_index).shape().rank() == 2 && - _ctx.at(input_to_forget_weights_index).shape().rank() == 2 && - _ctx.at(input_to_cell_weights_index).shape().rank() == 2 && - _ctx.at(input_to_output_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 && - _ctx.at(projection_weights_index).shape().rank() == 2 && - _ctx.at(output_state_in_index).shape().rank() == 2 && - _ctx.at(cell_state_in_index).shape().rank() == 2); - - OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 && - _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 && - _ctx.at(cell_to_output_weights_index).shape().rank() == 1 && - _ctx.at(input_gate_bias_index).shape().rank() == 1 && - _ctx.at(forget_gate_bias_index).shape().rank() == 1 && - _ctx.at(cell_bias_index).shape().rank() == 1 && - _ctx.at(output_gate_bias_index).shape().rank() == 1 && 
- _ctx.at(projection_bias_index).shape().rank() == 1); - - // CIFG assertion - OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 && - _ctx.at(input_gate_bias_index).shape().dim(0) == 0 && - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) || - (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 && - _ctx.at(input_gate_bias_index).shape().dim(0) != 0)); - - // Peephole assertion - OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 && - _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) || - (_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 && - _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0)); - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0; - bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). - // true: no CIFG - // false: CIFG - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE The projection weights may have data but the projection bias may not. 
- bool has_projection_param = has_projection_weights; - - const auto batch_size = _ctx.at(input_index).shape().dim(0); - OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) && - batch_size == _ctx.at(cell_state_in_index).shape().dim(0) && - batch_size == _ctx.at(scratch_buffer_index).shape().dim(0) && - batch_size == _ctx.at(output_state_out_index).shape().dim(0) && - batch_size == _ctx.at(cell_state_out_index).shape().dim(0) && - batch_size == _ctx.at(output_index).shape().dim(0)); - - const auto input_size = _ctx.at(input_index).shape().dim(1); - OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) && - input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) && - input_size == _ctx.at(input_to_output_weights_index).shape().dim(1)); - - const auto num_units = _ctx.at(cell_state_out_index).shape().dim(1); - OP_REQUIRES(num_units == _ctx.at(input_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) && - num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) && - num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) && - num_units == _ctx.at(cell_bias_index).shape().dim(0) && - num_units == _ctx.at(output_gate_bias_index).shape().dim(0) && - num_units == _ctx.at(cell_state_in_index).shape().dim(1) && - (((num_units * 3) == _ctx.at(scratch_buffer_index).shape().dim(1)) || - ((num_units * 4) == _ctx.at(scratch_buffer_index).shape().dim(1)))); - - const auto output_size = _ctx.at(output_index).shape().dim(1); - OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) && - output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) && - output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) && - output_size == _ctx.at(output_state_in_index).shape().dim(1) && - output_size == _ctx.at(output_state_out_index).shape().dim(1)); - - if (has_cifg_param) - { - OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1)); - OP_REQUIRES(num_units == _ctx.at(input_to_input_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) && - (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) && - num_units == _ctx.at(input_gate_bias_index).shape().dim(0)); - OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1)); - OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights && - has_input_gate_bias); - if (has_cell_to_input_weights) - { - // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole. 
- OP_REQUIRES(has_peephole_param); - } - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4); - } - else - { - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3); - } - - if (has_peephole_param) - { - OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) && - (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */)); - } - - if (has_projection_param) - { - OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1)); - OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0)); - if (has_projection_bias) - { - OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0)); - } - } -} - -void OperationValidator::visit(const ir::operation::L2Normalization &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; - - auto ifm_shape = _ctx.at(ifm_index).shape(); - auto ofm_shape = _ctx.at(ofm_index).shape(); - - OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); - - for (auto i = 0; i < ifm_shape.rank(); i++) - { - OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); - } -} - -void OperationValidator::visit(const ir::operation::Unpack &node) -{ - const auto num{node.param().num}; - OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size())); - const auto axis{node.param().axis}; - - const auto output_index{node.getInputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; - - const auto &input_shape = _ctx.at(input_index).shape(); - const auto input_rank = static_cast<int32_t>(input_shape.rank()); - - OP_REQUIRES(axis >= -input_rank && axis < input_rank); + OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size())); } void OperationValidator::visit(const ir::operation::Pad &node) { const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; - OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32); - - const auto output_index{node.getInputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; - - const auto &pad_shape = _ctx.at(pad_index).shape(); - const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank()); - - OP_REQUIRES(pad_shape.rank() == 2); - OP_REQUIRES(pad_shape.dim(0) == input_rank); - OP_REQUIRES(pad_shape.dim(1) == 2); - OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::Select &node) -{ - const auto output_index{node.getOutputs().at(0)}; - // This validator does not check shape. So checking isDynamic() is skipped. 
- - const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)}; - const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; - const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - UNUSED_RELEASE(output_index); - UNUSED_RELEASE(input_true_index); - UNUSED_RELEASE(input_false_index); - - OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8); -} - -void OperationValidator::visit(const ir::operation::StridedSlice &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; - const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)}; - const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; - const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - - UNUSED_RELEASE(starts_index); - UNUSED_RELEASE(ends_index); - UNUSED_RELEASE(strides_index); - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4); -} -void OperationValidator::visit(const ir::operation::Split &node) -{ - const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; - - if (_ctx.at(input_index).info().isDynamic()) - return; - - const auto num_splits = node.param().num_splits; - const auto input_rank = _ctx.at(input_index).shape().rank(); - const auto axis = node.param().axis < 0 ? node.param().axis + input_rank : node.param().axis; - - OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF); - OP_REQUIRES(axis >= 0 && axis < input_rank); - OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits)); - - OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); -} - -void OperationValidator::visit(const ir::operation::Shape &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - UNUSED_RELEASE(input_index); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32); } void OperationValidator::visit(const ir::operation::ResizeBilinear &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - - if (_ctx.at(output_index).info().isDynamic()) - { - return; - } - OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); - auto align_corners = node.param().align_corners; auto half_pixel_centers = node.param().half_pixel_centers; @@ -923,23 +178,31 @@ void OperationValidator::visit(const ir::operation::Reverse &node) OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32); OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +} + +void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == 
_ctx.at(input_index).shape()); + // Non-constant block_size and padding is not implemented yet + OP_REQUIRES(_ctx.at(block_size_index).isConstant()); + OP_REQUIRES(_ctx.at(paddings_index).isConstant()); } -void OperationValidator::visit(const ir::operation::If &) +void OperationValidator::visit(const ir::operation::SpaceToDepth &node) { - // TODO Add to validate with subgraphs + const auto block_size = node.param().block_size; + OP_REQUIRES(block_size >= 1); } -void OperationValidator::visit(const ir::operation::While &node) +void OperationValidator::visit(const ir::operation::Split &node) { - // This validator does not check shape. So checking isDynamic() is skipped. + const auto num_splits = node.param().num_splits; - OP_REQUIRES(node.getInputs().size() == node.getOutputs().size()); - // TODO Add to validate with subgraphs + OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF); + OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits)); } void OperationValidator::visit(const ir::operation::SquaredDifference &node) @@ -948,105 +211,33 @@ void OperationValidator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - // Check for Type equivalence OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(lhs_index).typeInfo().type()); OP_REQUIRES(_ctx.at(lhs_index).typeInfo().type() == _ctx.at(rhs_index).typeInfo().type()); - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - auto output_shape = _ctx.at(output_index).shape(); - auto lhs_shape = _ctx.at(lhs_index).shape(); - auto rhs_shape = _ctx.at(rhs_index).shape(); - // Check for output rank - OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank())); - auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank()); - - for (int idx = 1; idx <= min_rank; idx++) - { - int l_idx = lhs_shape.rank() - idx; - int r_idx = rhs_shape.rank() - idx; - int out_idx = output_shape.rank() - idx; - - OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0)); - - auto l_dims = lhs_shape.dim(l_idx); - auto r_dims = rhs_shape.dim(r_idx); - auto out_dims = output_shape.dim(out_idx); - - OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) || - ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims))); - } - auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? 
lhs_shape : rhs_shape; - for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++) - { - int out_idx = output_shape.rank() - idx; - int tmp_idx = tmp_shape.rank() - idx; - - OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) && - (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx))); - } } -void OperationValidator::visit(const ir::operation::Tile &node) + +void OperationValidator::visit(const ir::operation::StridedSlice &node) { const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - const auto multiple_index{node.getInputs().at(1)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; - OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank()); - OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } -void OperationValidator::visit(const ir::operation::Range &node) +void OperationValidator::visit(const ir::operation::TransposeConv &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)}; - const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)}; - const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)}; - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0); - OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0); - OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0); + OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) || + (node.param().padding.type == ir::PaddingType::VALID)); } -void OperationValidator::visit(const ir::operation::MatrixBandPart &node) +void OperationValidator::visit(const ir::operation::Unpack &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)}; - const auto num_lower_index{ - node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)}; - const auto num_upper_index{ - node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)}; - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2); // input must be more than 2 dim matrix - OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar - OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar + const auto num{node.param().num}; + OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size())); } -void OperationValidator::visit(const ir::operation::LogSoftmax &node) +void OperationValidator::visit(const ir::operation::While &node) { - VERBOSE(LogSoftmax) << "Configure LOGSOFTMAX operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); + OP_REQUIRES(node.getInputs().size() == node.getOutputs().size()); } } // namespace compiler diff --git a/runtime/onert/core/src/compiler/OperationValidator.h 
b/runtime/onert/core/src/compiler/OperationValidator.h index deb6357bb..f884a3765 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.h +++ b/runtime/onert/core/src/compiler/OperationValidator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #ifndef __ONERT_COMPILER_OPERATION_VALIDATOR_H__ #define __ONERT_COMPILER_OPERATION_VALIDATOR_H__ -#include "ir/Layout.h" #include "ir/OperationVisitor.h" namespace onert @@ -47,51 +46,30 @@ public: void visit(const ir::operation::BatchMatMul &node) override; void visit(const ir::operation::BatchToSpaceND &node) override; void visit(const ir::operation::Comparison &node) override; - void visit(const ir::operation::Softmax &node) override; - void visit(const ir::operation::InstanceNorm &node) override; - void visit(const ir::operation::Permute &node) override; - void visit(const ir::operation::Pool2D &node) override; - void visit(const ir::operation::Reduce &node) override; - void visit(const ir::operation::Transpose &node) override; - void visit(const ir::operation::RNN &node) override; - void visit(const ir::operation::SpaceToBatchND &node) override; - void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::ElementwiseActivation &node) override; void visit(const ir::operation::ElementwiseBinary &node) override; void visit(const ir::operation::ElementwiseUnary &node) override; void visit(const ir::operation::EmbeddingLookup &node) override; void visit(const ir::operation::ExpandDims &node) override; void visit(const ir::operation::HashtableLookup &node) override; - void visit(const ir::operation::TransposeConv &node) override; - void visit(const ir::operation::Gather &node) override; - void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::Pack &node) override; - void visit(const ir::operation::LSTM &node) override; - void visit(const ir::operation::L2Normalization &node) override; - void visit(const ir::operation::Unpack &node) override; void visit(const ir::operation::Pad &node) override; - void visit(const ir::operation::Select &node) override; - void visit(const ir::operation::StridedSlice &node) override; - void visit(const ir::operation::Split &node) override; - void visit(const ir::operation::Shape &node) override; void visit(const ir::operation::ResizeBilinear &node) override; void visit(const ir::operation::Reverse &node) override; - void visit(const ir::operation::If &node) override; - void visit(const ir::operation::While &node) override; + void visit(const ir::operation::SpaceToBatchND &node) override; + void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::Split &node) override; void visit(const ir::operation::SquaredDifference &node) override; - void visit(const ir::operation::Tile &node) override; - void visit(const ir::operation::Range &node) override; - void visit(const ir::operation::MatrixBandPart &node) override; - void visit(const ir::operation::LogSoftmax &node) override; - -private: - void checkUnaryOp(const ir::Operation &node); + void visit(const ir::operation::StridedSlice &node) override; + void visit(const ir::operation::TransposeConv &node) override; + void 
visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::While &node) override; private: // TODO Remove _ctx field const ir::Graph &_graph; const ir::Operands &_ctx; - ir::Layout _current_op_seq_layout; }; } // namespace compiler diff --git a/runtime/onert/core/src/compiler/ShapeValidator.cc b/runtime/onert/core/src/compiler/ShapeValidator.cc new file mode 100644 index 000000000..8be4fe6ec --- /dev/null +++ b/runtime/onert/core/src/compiler/ShapeValidator.cc @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ShapeValidator.h" + +#include <typeinfo> + +#include "ir/Graph.h" +#include "ir/operation/LowerInfo.h" + +#include "util/logging.h" +#include "util/Utils.h" + +#define OP_REQUIRES(EXP) \ + do \ + { \ + if (!(EXP)) \ + throw std::runtime_error("ShapeValidator failed at line " + std::to_string(__LINE__)); \ + } while (0) + +namespace onert +{ +namespace compiler +{ + +ShapeValidator::ShapeValidator(const ir::Graph &graph) + : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN} +{ +} + +void ShapeValidator::checkUnaryOp(const ir::Operation &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + // Check if I/O shapes match + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::operator()() +{ + // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when + // creating Compiler + assert(_graph.subgraphs() == nullptr); + + _current_op_seq_layout = _graph.layout(); + + _graph.operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); +} + +void ShapeValidator::visit(const ir::operation::BatchMatMul &node) +{ + const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS)); + const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS)); + const auto out_index{node.getOutputs().at(0)}; + + if (_ctx.at(out_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4); + OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4); + OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2); + OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2); +} + +void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = 
_ctx.at(ofm_index).shape().asFeature(frontend_layout); + + // All requirement as per NNAPI specification. + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); + + OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); + + OP_REQUIRES(input_shape.C == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::BCQFullyConnected &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto weight_scales_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_SCALES)}; + const auto weight_binary_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_BINARY)}; + const auto weight_cluster_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + // const auto bias_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::BIAS)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(weight_scales_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(weight_binary_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(ifm_index).shape().dim(1) == _ctx.at(ofm_index).shape().dim(1)); + + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(0) > 0); + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(1) == 2); + + // more shape validation will be done inside kernel. + + // TODO Check bias dimension (can be null tensor) +} + +void ShapeValidator::visit(const ir::operation::BCQGather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto indices_index{node.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto input_binary_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto input_scales_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_SCALES)}; + const auto input_clusters_index{ + node.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + + OP_REQUIRES(_ctx.at(indices_index).shape().rank() <= 2); // TODO : support rank up to 4 or more + OP_REQUIRES(_ctx.at(input_binary_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(input_scales_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(input_clusters_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(0) > 0); + OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(1) == 2); + + // more shape validation will be done inside kernel. 
+} + +void ShapeValidator::visit(const ir::operation::Comparison &) +{ + // TODO Shape validation of comparison +} + +void ShapeValidator::visit(const ir::operation::Softmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::InstanceNorm &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape()); + OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1); +} + +void ShapeValidator::visit(const ir::operation::Pool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); +} + +void ShapeValidator::visit(const ir::operation::Permute &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Reduce &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; + const auto input_shape = _ctx.at(input_index).shape(); + const auto output_shape = _ctx.at(output_index).shape(); + + OP_REQUIRES(input_shape.rank() <= 4); + OP_REQUIRES(output_shape.rank() <= input_shape.rank()); + + // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only + // supports cases reducing height and width or reducing depth. + // TODO We have to support all cases of dimensions up to 4. + // For correct permuting, we have to set output's shape to be equal in dimension position of the + // input. But the positions of the same dimensions in the input and output may be set differently. + // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original + // output shape should be {1,3,1,5}, but real output shape may be {3,5}. If you simply try to + // extend it in 4 dimensions, it should be {1,1,3,5}. + // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of + // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the + // next operation is not desired. 
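+  // For example, reducing H and W of a {2,3,4,5} input yields a {2,5} output, which the
+  // rank-2 case below accepts because input dim(0) and dim(3) are preserved.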
+ if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank()) + { + if (output_shape.rank() == 2) + { + // Reducing HW + OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) && + input_shape.dim(3) == output_shape.dim(1)); + } + else if (output_shape.rank() == 3) + { + // Reducing C or + // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1) + OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) && + input_shape.dim(1) == output_shape.dim(1) && + input_shape.dim(2) == output_shape.dim(2)) || + (input_shape.dim(0) == output_shape.dim(0) && + (input_shape.dim(1) == output_shape.dim(1) || + input_shape.dim(2) == output_shape.dim(1)) && + input_shape.dim(3) == 1 && output_shape.dim(2) == 1)); + } + } +} + +void ShapeValidator::visit(const ir::operation::Transpose &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; + + const auto &output_shape = _ctx.at(output_index).shape(); + const auto &input_shape = _ctx.at(input_index).shape(); + + OP_REQUIRES(_ctx.at(perm_index).shape().num_elements() == 0 || + input_shape.rank() == static_cast<int>(_ctx.at(perm_index).shape().num_elements())); + OP_REQUIRES(input_shape.rank() == output_shape.rank()); +} + +void ShapeValidator::visit(const ir::operation::RNN &node) +{ + // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn + // TODO Support dynamic rnn + const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto hidden_state_out_index{ + node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; + const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; + const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; + + const auto batch_size = _ctx.at(output_index).shape().dim(0); + const auto num_units = _ctx.at(output_index).shape().dim(1); + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 && + _ctx.at(hidden_state_out_index).shape().rank() == 2 && + _ctx.at(input_index).shape().rank() == 2 && + _ctx.at(weights_index).shape().rank() == 2 && + _ctx.at(recurrent_weights_index).shape().rank() == 2 && + _ctx.at(hidden_state_in_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1); + + OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) && + batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) && + batch_size == _ctx.at(hidden_state_out_index).shape().dim(0)); + OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1)); + + OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_weights_index).shape().dim(0) && + num_units == _ctx.at(bias_index).shape().dim(0)); + OP_REQUIRES(num_units == _ctx.at(output_index).shape().dim(1) && + num_units == _ctx.at(recurrent_weights_index).shape().dim(1) && + num_units == _ctx.at(hidden_state_in_index).shape().dim(1) && + num_units == 
_ctx.at(hidden_state_out_index).shape().dim(1)); +} + +void ShapeValidator::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + + // All requirement as per NNAPI specification. + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); + OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2); + OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2); + + OP_REQUIRES(input_shape.C == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + const auto block_size = node.param().block_size; + + // All assertions as per NNAPI specification. 
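+  // For example, with block_size == 2 an NHWC input of {1,4,4,3} becomes {1,2,2,12}:
+  // H and W must be divisible by block_size and C is multiplied by block_size * block_size.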
+ OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES((input_shape.H % block_size == 0) && (input_shape.W % block_size == 0)); + OP_REQUIRES(input_shape.N == output_shape.N); + OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::ElementwiseActivation &node) { checkUnaryOp(node); } + +void ShapeValidator::visit(const ir::operation::ElementwiseBinary &) +{ + // TODO Shape validation of ElementwiseBinary +} + +void ShapeValidator::visit(const ir::operation::ElementwiseUnary &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + + const auto &output_obj = _ctx.at(output_index); + const auto &lookups_obj = _ctx.at(lookups_index); + const auto &values_obj = _ctx.at(values_index); + + // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying + // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729) + { + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto &output_shape = output_obj.shape(); + const auto &lookups_shape = lookups_obj.shape(); + const auto &values_shape = values_obj.shape(); + + OP_REQUIRES(lookups_shape.rank() == 1); + OP_REQUIRES(values_shape.rank() >= 2); + + // output should be a n-D tensor with the same rank and shape as the values tensor, except for + // the first dimension which has the same size as lookups' only dimension. 
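+    // For example, lookups of shape {3} into values of shape {10,4} produce an output of
+    // shape {3,4}, which satisfies the rank and dimension checks below.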
+ OP_REQUIRES(output_shape.rank() == values_shape.rank()); + OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0)); + for (int n = 1; n < output_shape.rank(); ++n) + { + OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n)); + } + } +} + +void ShapeValidator::visit(const ir::operation::ExpandDims &node) +{ + const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; + + if (_ctx.at(axis_index).info().isDynamic()) + return; + OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); +} + +void ShapeValidator::visit(const ir::operation::HashtableLookup &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; + const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + + const auto &output_obj = _ctx.at(output_index); + const auto &lookups_obj = _ctx.at(lookups_index); + const auto &keys_obj = _ctx.at(keys_index); + const auto &values_obj = _ctx.at(values_index); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto &output_shape = output_obj.shape(); + const auto &lookups_shape = lookups_obj.shape(); + const auto &keys_shape = keys_obj.shape(); + const auto &values_shape = values_obj.shape(); + + OP_REQUIRES(values_shape.rank() == output_shape.rank()); + OP_REQUIRES(lookups_shape.rank() == 1); + OP_REQUIRES(keys_shape.rank() == 1); + OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0)); + OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0)); +} + +void ShapeValidator::visit(const ir::operation::TransposeConv &node) +{ + // shape check + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; + const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; + + // Only 4D tensors are supported + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank()); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank()); + + const auto frontend_layout = _current_op_seq_layout; + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + // The kernel has only IHWO layout on frontend + // So ker_shape is treated here below + // I -> N + // H -> H + // W -> W + // O -> C + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC); + + OP_REQUIRES(ifm_shape.N == ofm_shape.N); + OP_REQUIRES(ifm_shape.C == ker_shape.C); + OP_REQUIRES(ker_shape.N == ofm_shape.C); +} + +void ShapeValidator::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + const auto indices_shape = _ctx.at(indices_index).shape(); + const auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() <= 4); + OP_REQUIRES(indices_shape.rank() <= 3); + OP_REQUIRES(ofm_shape.rank() <= 4); +} + +void 
ShapeValidator::visit(const ir::operation::DepthToSpace &node) +{ + int32_t block_size = node.param().block_size; + + // shape check + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout); + const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout); + + OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); + + { + OP_REQUIRES(output_shape.N == input_shape.N); + OP_REQUIRES(output_shape.H == input_shape.H * block_size); + OP_REQUIRES(output_shape.W == input_shape.W * block_size); + OP_REQUIRES(input_shape.C % (block_size * block_size) == 0); + OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size)); + } +} + +void ShapeValidator::visit(const ir::operation::Pack &node) +{ + const auto axis{node.param().axis}; + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + // shape check + const auto &output_shape = _ctx.at(output_index).shape(); + const auto output_rank = static_cast<int32_t>(output_shape.rank()); + + const auto input1_index{node.getInputs().at(0)}; + const auto input_shape = _ctx.at(input1_index).shape(); + + OP_REQUIRES(axis >= -output_rank && axis < output_rank); + for (const auto &index : node.getInputs()) + { + OP_REQUIRES(input_shape == _ctx.at(index).shape()); + } +} + +void ShapeValidator::visit(const ir::operation::LSTM &node) +{ + // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn + // TODO Support dynamic rnn + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; + const auto cell_to_forget_weights_index{ + 
node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; + const auto projection_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); + for (int i = 0; i < _ctx.at(input_index).shape().rank() - 1; ++i) + { + OP_REQUIRES(_ctx.at(input_index).shape().dim(i) == _ctx.at(output_index).shape().dim(i)); + } + OP_REQUIRES( + (_ctx.at(output_index).shape().rank() == 2 || _ctx.at(output_index).shape().rank() == 3) && + (_ctx.at(input_index).shape().rank() == 2 || _ctx.at(input_index).shape().rank() == 3) && + _ctx.at(input_to_input_weights_index).shape().rank() == 2 && + _ctx.at(input_to_forget_weights_index).shape().rank() == 2 && + _ctx.at(input_to_cell_weights_index).shape().rank() == 2 && + _ctx.at(input_to_output_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 && + _ctx.at(projection_weights_index).shape().rank() == 2 && + _ctx.at(output_state_in_index).shape().rank() == 2 && + _ctx.at(cell_state_in_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 && + _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 && + _ctx.at(cell_to_output_weights_index).shape().rank() == 1 && + _ctx.at(input_gate_bias_index).shape().rank() == 1 && + _ctx.at(forget_gate_bias_index).shape().rank() == 1 && + _ctx.at(cell_bias_index).shape().rank() == 1 && + _ctx.at(output_gate_bias_index).shape().rank() == 1 && + _ctx.at(projection_bias_index).shape().rank() == 1); + + // CIFG assertion + OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 && + _ctx.at(input_gate_bias_index).shape().dim(0) == 0 && + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) || + (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 && + _ctx.at(input_gate_bias_index).shape().dim(0) != 0)); + + // Peephole assertion + OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 && + _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) || + 
(_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 && + _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0)); + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0; + bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); + + // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE The projection weights may have data but the projection bias may not. + bool has_projection_param = has_projection_weights; + + const auto batch_size = (_ctx.at(input_index).shape().rank() == 3 && node.param().time_major) + ? _ctx.at(input_index).shape().dim(1) + : _ctx.at(input_index).shape().dim(0); + OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) && + batch_size == _ctx.at(cell_state_in_index).shape().dim(0)); + + const auto input_size = _ctx.at(input_index).shape().dim(_ctx.at(input_index).shape().rank() - 1); + OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) && + input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) && + input_size == _ctx.at(input_to_output_weights_index).shape().dim(1)); + + const auto num_units = _ctx.at(input_to_output_weights_index).shape().dim(0); + OP_REQUIRES(num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) && + num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) && + num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) && + num_units == _ctx.at(cell_bias_index).shape().dim(0) && + num_units == _ctx.at(output_gate_bias_index).shape().dim(0) && + num_units == _ctx.at(cell_state_in_index).shape().dim(1)); + + const auto output_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); + OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) && + output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) && + output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) && + output_size == _ctx.at(output_state_in_index).shape().dim(1)); + + if (has_cifg_param) + { + OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1)); + OP_REQUIRES(num_units == 
_ctx.at(input_to_input_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) && + (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) && + num_units == _ctx.at(input_gate_bias_index).shape().dim(0)); + OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1)); + OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights && + has_input_gate_bias); + if (has_cell_to_input_weights) + { + // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole. + OP_REQUIRES(has_peephole_param); + } + if (_ctx.exist(scratch_buffer_index)) + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4); + } + else + { + if (_ctx.exist(scratch_buffer_index)) + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3); + } + + if (has_peephole_param) + { + OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) && + num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) && + (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */)); + } + + if (has_projection_param) + { + OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1)); + OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0)); + if (has_projection_bias) + { + OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0)); + } + } + + if (_ctx.exist(scratch_buffer_index)) + { + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(scratch_buffer_index).shape().dim(0)); + } + + if (_ctx.exist(output_state_out_index)) + { + OP_REQUIRES(_ctx.at(output_state_out_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(output_state_out_index).shape().dim(0)); + OP_REQUIRES(output_size == _ctx.at(output_state_out_index).shape().dim(1)); + } + + if (_ctx.exist(cell_state_out_index)) + { + OP_REQUIRES(_ctx.at(cell_state_out_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(cell_state_out_index).shape().dim(0)); + OP_REQUIRES(num_units == _ctx.at(cell_state_out_index).shape().dim(1)); + } +} + +void ShapeValidator::visit(const ir::operation::L2Normalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); + + for (auto i = 0; i < ifm_shape.rank(); i++) + { + OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); + } +} + +void ShapeValidator::visit(const ir::operation::Unpack &node) +{ + const auto axis{node.param().axis}; + const auto output_index{node.getInputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + + const auto &input_shape = _ctx.at(input_index).shape(); + const auto input_rank = static_cast<int32_t>(input_shape.rank()); + + OP_REQUIRES(axis >= -input_rank && axis < input_rank); +} + +void ShapeValidator::visit(const ir::operation::Pad &node) +{ + const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; + OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == 
ir::DataType::INT32); + + const auto output_index{node.getInputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + + const auto &pad_shape = _ctx.at(pad_index).shape(); + const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank()); + + OP_REQUIRES(pad_shape.rank() == 2); + OP_REQUIRES(pad_shape.dim(0) == input_rank); + OP_REQUIRES(pad_shape.dim(1) == 2); + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Select &node) +{ + const auto output_index{node.getOutputs().at(0)}; + // This validator does not check shape. So checking isDynamic() is skipped. + + const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)}; + const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; + const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; + UNUSED_RELEASE(output_index); + UNUSED_RELEASE(input_true_index); + UNUSED_RELEASE(input_false_index); + + OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8); +} + +void ShapeValidator::visit(const ir::operation::StridedSlice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4); +} + +void ShapeValidator::visit(const ir::operation::Split &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; + + const auto num_splits = node.param().num_splits; + const auto input_rank = _ctx.at(input_index).shape().rank(); + auto axis = *reinterpret_cast<const int32_t *>(_ctx.at(axis_index).data()->base()); + axis = axis < 0 ? 
axis + input_rank : axis; + + OP_REQUIRES(axis >= 0 && axis < input_rank); + OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); +} + +void ShapeValidator::visit(const ir::operation::Shape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + UNUSED_RELEASE(input_index); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1); +} + +void ShapeValidator::visit(const ir::operation::ResizeBilinear &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + { + return; + } + OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); +} + +void ShapeValidator::visit(const ir::operation::Reverse &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::visit(const ir::operation::If &) +{ + // TODO Add to validate with subgraphs +} + +void ShapeValidator::visit(const ir::operation::While &) +{ + // This validator does not check shape. So checking isDynamic() is skipped. + // TODO Add to validate with subgraphs +} + +void ShapeValidator::visit(const ir::operation::SquaredDifference &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + auto output_shape = _ctx.at(output_index).shape(); + auto lhs_shape = _ctx.at(lhs_index).shape(); + auto rhs_shape = _ctx.at(rhs_index).shape(); + // Check for output rank + OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank())); + auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank()); + + for (int idx = 1; idx <= min_rank; idx++) + { + int l_idx = lhs_shape.rank() - idx; + int r_idx = rhs_shape.rank() - idx; + int out_idx = output_shape.rank() - idx; + + OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0)); + + auto l_dims = lhs_shape.dim(l_idx); + auto r_dims = rhs_shape.dim(r_idx); + auto out_dims = output_shape.dim(out_idx); + + OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) || + ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims))); + } + auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? 
lhs_shape : rhs_shape; + for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++) + { + int out_idx = output_shape.rank() - idx; + int tmp_idx = tmp_shape.rank() - idx; + + OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) && + (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx))); + } +} +void ShapeValidator::visit(const ir::operation::Tile &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + const auto multiple_index{node.getInputs().at(1)}; + + OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank()); + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Range &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)}; + const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)}; + const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0); + OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0); + OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0); +} + +void ShapeValidator::visit(const ir::operation::MatrixBandPart &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)}; + const auto num_lower_index{ + node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)}; + const auto num_upper_index{ + node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2); // input must be more than 2 dim matrix + OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar + OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar +} + +void ShapeValidator::visit(const ir::operation::LogSoftmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/ShapeValidator.h b/runtime/onert/core/src/compiler/ShapeValidator.h new file mode 100644 index 000000000..f40c098d5 --- /dev/null +++ b/runtime/onert/core/src/compiler/ShapeValidator.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_COMPILER_SHAPE_VALIDATOR_H__ +#define __ONERT_COMPILER_SHAPE_VALIDATOR_H__ + +#include "ir/Layout.h" +#include "ir/OperationVisitor.h" + +namespace onert +{ +namespace ir +{ +class Graph; +class Operands; +} // namespace ir +} // namespace onert + +namespace onert +{ +namespace compiler +{ + +class ShapeValidator : public ir::OperationVisitor +{ +public: + ShapeValidator(void) = delete; + ShapeValidator(const ir::Graph &graph); + +public: + void operator()(); + +public: + void visit(const ir::operation::BatchMatMul &node) override; + void visit(const ir::operation::BatchToSpaceND &node) override; + void visit(const ir::operation::BCQFullyConnected &node) override; + void visit(const ir::operation::BCQGather &node) override; + void visit(const ir::operation::Comparison &node) override; + void visit(const ir::operation::Softmax &node) override; + void visit(const ir::operation::InstanceNorm &node) override; + void visit(const ir::operation::Permute &node) override; + void visit(const ir::operation::Pool2D &node) override; + void visit(const ir::operation::Reduce &node) override; + void visit(const ir::operation::Transpose &node) override; + void visit(const ir::operation::RNN &node) override; + void visit(const ir::operation::SpaceToBatchND &node) override; + void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::ElementwiseActivation &node) override; + void visit(const ir::operation::ElementwiseBinary &node) override; + void visit(const ir::operation::ElementwiseUnary &node) override; + void visit(const ir::operation::EmbeddingLookup &node) override; + void visit(const ir::operation::ExpandDims &node) override; + void visit(const ir::operation::HashtableLookup &node) override; + void visit(const ir::operation::TransposeConv &node) override; + void visit(const ir::operation::Gather &node) override; + void visit(const ir::operation::DepthToSpace &node) override; + void visit(const ir::operation::Pack &node) override; + void visit(const ir::operation::LSTM &node) override; + void visit(const ir::operation::L2Normalization &node) override; + void visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::Pad &node) override; + void visit(const ir::operation::Select &node) override; + void visit(const ir::operation::StridedSlice &node) override; + void visit(const ir::operation::Split &node) override; + void visit(const ir::operation::Shape &node) override; + void visit(const ir::operation::ResizeBilinear &node) override; + void visit(const ir::operation::Reverse &node) override; + void visit(const ir::operation::If &node) override; + void visit(const ir::operation::While &node) override; + void visit(const ir::operation::SquaredDifference &node) override; + void visit(const ir::operation::Tile &node) override; + void visit(const ir::operation::Range &node) override; + void visit(const ir::operation::MatrixBandPart &node) override; + void visit(const ir::operation::LogSoftmax &node) override; + +private: + void checkUnaryOp(const ir::Operation &node); + +private: + // TODO Remove _ctx field + const ir::Graph &_graph; + const ir::Operands &_ctx; + ir::Layout _current_op_seq_layout; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_SHAPE_VALIDATOR_H__ diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc index 4eba1ff49..df129d98b 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInference.cc +++ 
b/runtime/onert/core/src/compiler/StaticShapeInference.cc @@ -147,16 +147,26 @@ void StaticShapeInferer::visit(const ir::operation::ArgMax &op) const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; const auto &input = _operands.at(input_idx); + const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto &axis = _operands.at(axis_idx); + // get mutable output operand const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); - const auto rank = input.info().shape().rank(); - const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis); - assert(0 <= axis && axis < rank); + if (!axis.isConstant()) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } + + const auto rank = input.info().shape().rank(); + auto axis_value = axis.asScalar<int32_t>(); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; // re-sizing output shape - ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis, rank); + ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis_value, rank); output.info().shape(new_shape); } @@ -165,13 +175,60 @@ void StaticShapeInferer::visit(const ir::operation::BatchMatMul &op) const auto lhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::LHS); const auto rhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::RHS); const auto output_index = op.getOutputs().at(0); - const auto lhs = _operands.at(lhs_index); - const auto rhs = _operands.at(rhs_index); + const auto &lhs = _operands.at(lhs_index); + const auto &rhs = _operands.at(rhs_index); auto &output = _operands.at(output_index); auto new_shape = shape_inference::inferBatchMatMulShape(lhs.shape(), rhs.shape(), op.param()); output.info().shape(new_shape); } +void StaticShapeInferer::visit(const ir::operation::BCQFullyConnected &op) +{ + const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto &input = _operands.at(input_idx); + + const auto cluster_idx{ + op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + const auto &cluster = _operands.at(cluster_idx); + + const auto output_idx = op.getOutputs().at(0); + ir::Operand &output = _operands.at(output_idx); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base()); + assert(cluster_buf); + + // re-sizing output shape + ir::Shape new_shape = shape_inference::inferBCQFullyConnectedShape( + input.info().shape(), cluster.info().shape(), cluster_buf); + output.info().shape(new_shape); +} + +void StaticShapeInferer::visit(const ir::operation::BCQGather &op) +{ + const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto &indices = _operands.at(indices_idx); + + const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto &input_binary = _operands.at(input_binary_idx); + + const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + const auto &cluster = _operands.at(cluster_idx); + + const auto output_idx = op.getOutputs().at(0); + ir::Operand &output = _operands.at(output_idx); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base()); + assert(cluster_buf); + + auto rank = input_binary.shape().rank(); + + // re-sizing output shape + ir::Shape new_shape = shape_inference::inferBCQGatherShape( + indices.info().shape(), 
cluster.info().shape(), cluster_buf, rank, op.param()); + + output.info().shape(new_shape); +} + void StaticShapeInferer::visit(const ir::operation::BinaryArithmetic &op) { handleBinaryArithmeticOp(op, op.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS), @@ -439,6 +496,98 @@ void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); } +void StaticShapeInferer::visit(const ir::operation::LSTM &op) +{ + const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + auto &output = _operands.at(output_index); + + const auto output_state_out_index{ + op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + + const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + + if (output.info().isDynamic() || (_operands.exist(output_state_out_index) && + _operands.at(output_state_out_index).info().isDynamic()) || + (_operands.exist(cell_state_out_index) && + _operands.at(cell_state_out_index).info().isDynamic()) || + (_operands.exist(scratch_buffer_index) && + _operands.at(scratch_buffer_index).info().isDynamic())) + return; + + const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto &input = _operands.at(input_index); + + const auto input_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto &input_to_output_weights = _operands.at(input_to_output_weights_index); + + const auto recurrent_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto &recurrent_to_output_weights = _operands.at(recurrent_to_output_weights_index); + + // re-sizing outputs + const int n_batch = (input.shape().rank() == 3 && op.param().time_major) ? 
input.shape().dim(1) + : input.shape().dim(0); + const int n_cell = input_to_output_weights.shape().dim(0); + const int n_output = recurrent_to_output_weights.shape().dim(1); + if (input.shape().rank() == 3) + { + if (op.param().time_major) + output.info().shape(ir::Shape{input.shape().dim(0), n_batch, n_output}); + else + output.info().shape(ir::Shape{n_batch, input.shape().dim(1), n_output}); + } + else + { + assert(input.shape().rank() == 2); + output.info().shape(ir::Shape{n_batch, n_output}); + } + + if (_operands.exist(output_state_out_index)) + { + auto &output_state_out = _operands.at(output_state_out_index); + output_state_out.info().shape(ir::Shape{n_batch, n_output}); + } + + if (_operands.exist(cell_state_out_index)) + { + auto &cell_state_out = _operands.at(cell_state_out_index); + cell_state_out.info().shape(ir::Shape{n_batch, n_cell}); + } + + if (_operands.exist(scratch_buffer_index)) + { + auto &scratch_buffer = _operands.at(scratch_buffer_index); + + const auto input_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + + bool has_input_to_input_weights = + _operands.at(input_to_input_weights_index).shape().dim(0) != 0 && + _operands.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + + // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + if (has_cifg_param) + { + scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 4}); + } + else + { + scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 3}); + } + } +} + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); @@ -683,9 +832,29 @@ void StaticShapeInferer::visit(const ir::operation::ResizeBilinear &op) const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); + int32_t height_out, width_out; + if (op.getInputs().size() == 2) + { + auto &size = _operands.at(op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE)); + if (!size.isConstant()) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } + const auto size_v = size.asVector<std::int32_t>(); + height_out = size_v[0]; + width_out = size_v[1]; + } + else + { + height_out = op.param().height_out; + width_out = op.param().width_out; + } + // Shape inferencing logic based on Params - ir::Shape new_shape = shape_inference::inferResizeBilinearShape( - input.shape(), op.param().height_out, op.param().width_out); + ir::Shape new_shape = + shape_inference::inferResizeBilinearShape(input.shape(), height_out, width_out); // if size_op is from Const, TFLC put the shape of output into tensor if (new_shape != output.shape()) @@ -803,21 +972,35 @@ void StaticShapeInferer::visit(const ir::operation::SpaceToBatchND &op) void StaticShapeInferer::visit(const ir::operation::Split &op) { - const auto input_idx{op.getInputs().at(0)}; + const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)}; const auto &input = _operands.at(input_idx); - const auto axis = op.param().axis; + 
const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)}; + const auto &axis = _operands.at(axis_idx); + + auto outputs = op.getOutputs(); + if (!axis.isConstant()) + { + for (auto output_idx : outputs) + { + ir::Operand &output = _operands.at(output_idx); + output.info().setDynamic(); + } + _return_has_dynamic_tensor = true; + return; + } + + const auto num_splits = op.param().num_splits; const auto rank = input.info().shape().rank(); - auto axis_resolved = axis < 0 ? axis + rank : axis; + auto axis_value = axis.asScalar<int32_t>(); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; - assert(0 <= axis_resolved && axis_resolved < rank); + assert(0 <= axis_value && axis_value < rank); ir::Shape new_shape = - shape_inference::inferSplitShape(input.info().shape(), axis_resolved, num_splits); - auto output_tensors = op.getOutputs(); - for (auto output_idx : output_tensors) + shape_inference::inferSplitShape(input.info().shape(), axis_value, num_splits); + for (auto output_idx : outputs) { ir::Operand &output = _operands.at(output_idx); output.info().shape(new_shape); @@ -838,13 +1021,6 @@ void StaticShapeInferer::visit(const ir::operation::Squeeze &op) const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); - if (input.info().isDynamic()) - { - output.info().setDynamic(); - _return_has_dynamic_tensor = true; - return; - } - // Squeeze output shape ir::Shape new_shape = shape_inference::inferSqueezeShape(input.info().shape(), op.param()); output.info().shape(new_shape); @@ -909,7 +1085,8 @@ void StaticShapeInferer::visit(const ir::operation::Tile &op) assert(multiplier_buffer); // re-sizing output shape - auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer); + auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer, + multiplier.shape().num_elements()); output.info().shape(new_shape); } @@ -918,14 +1095,43 @@ void StaticShapeInferer::visit(const ir::operation::Transpose &op) const auto input_idx{op.getInputs().at(ir::operation::Transpose::Input::INPUT)}; const auto &input = _operands.at(input_idx); + const auto perm_idx{op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; + const auto &perm = _operands.at(perm_idx); + + // perm.shape() != ir::Shape{0} means that perm is (n-1...0) + // TODO This condition changes to perm.num_elements() == 0 + const auto is_regular_transpose = perm.shape() == ir::Shape{0}; + // get mutable output operand const auto output_idx = op.getOutputs().at(0); - ir::Operand &output = _operands.at(output_idx); - const auto perm{op.param().perm}; - // const auto rank{op.param().rank}; + auto &output = _operands.at(output_idx); + if (!perm.isConstant() && !is_regular_transpose) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } - // set output shape, based on input and params - ir::Shape new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm); + ir::Shape new_shape; + if (is_regular_transpose) + { + // Call by (n-1...0) + new_shape = shape_inference::inferTransposeShape(input.info().shape(), nullptr, 0); + } + else + { + // Check rank + if (input.info().shape().rank() != static_cast<int>(perm.info().shape().num_elements())) + { + throw std::runtime_error("StaticShapeInferer failed, bad rank size: " + + std::to_string(perm.info().shape().num_elements())); + } + + // set output shape, based on input and params + const auto perm_buf = reinterpret_cast<const int32_t
*>(perm.data()->base()); + new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm_buf, + perm.shape().num_elements()); + } output.info().shape(new_shape); } diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h index 8be87b081..e42225cbf 100644 --- a/runtime/onert/core/src/compiler/TensorRegistries.h +++ b/runtime/onert/core/src/compiler/TensorRegistries.h @@ -69,7 +69,7 @@ public: return _cf_tensor_reg; } - std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind) const + backend::ITensor *getITensor(ir::OperandIndex ind) const { for (auto &tensor_reg : _tensor_regs) { diff --git a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc index 647669e46..ef6240894 100644 --- a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc +++ b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc @@ -44,7 +44,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O const auto key = ReplaceKey{input, factor}; if (_replace_operands_map.count(key) == 0) { - auto new_object = object; + ir::Operand new_object(object); new_object.unsetDef(); // TODO Remove const_case const_cast<ir::OperationIndexSet &>(new_object.getUses()).clear(); @@ -81,7 +81,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O } // Now this runtime does not support the node making output as constant - for (const auto &output : node.getOutputs()) + for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { UNUSED_RELEASE(output); assert(!_graph.operands().at(output).isConstant()); diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc new file mode 100644 index 000000000..c176f6ffb --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConstantOutputPass.h" + +#include "ir/Graph.h" +#include "ir/operation/Permute.h" +#include "util/logging.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +void ConstantOutputPass::callback(const ir::OperandIndex &ind, ir::Operand &obj) +{ + if (!_graph.getOutputs().contains(ind) || !obj.isConstant()) + return; + + auto permute_input_ind = _graph.addOperand(obj.shape(), obj.typeInfo()); + auto &permute_input_obj = _graph.operands().at(permute_input_ind); + + // Move the const data + permute_input_obj.data(obj.shareData()); + obj.releaseData(); + obj.info().setAsNonConst(); + + using ir::operation::Permute; + auto permute_obj = std::make_unique<Permute>(permute_input_ind, ind, Permute::Type::COPY); + auto permute_ind = _graph.operations().push(std::move(permute_obj)); + + permute_input_obj.insertUse(permute_ind); + obj.setDef(permute_ind); + + // Make the operations that uses this operand to use the generated operand + auto orig_uses = obj.getUses(); + for (auto use : orig_uses) + { + permute_input_obj.insertUse(use); + obj.removeUse(use); + _graph.operations().at(use).replaceInputs(ind, permute_input_ind); + } + + VERBOSE(ConstantOutputPass) << "Permute Op inserted for a constant ouput, node index : " + << permute_ind << std::endl; + VERBOSE(ConstantOutputPass) << " - Input (inserted) Operand : " << permute_input_ind + << std::endl; + VERBOSE(ConstantOutputPass) << " - Output(original) Operand : " << ind << std::endl; +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h new file mode 100644 index 000000000..193dd3a68 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ +#define __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ + +#include "OperandPass.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Pass to specially handle constant model outputs + * + * As an output buffer is given right before an execution but constant initialization is done at + * prepare phase, the current runtime structure cannot handle when an output is constant. + * To resolve this problem, this pass inserts a Permute layer with a const input and make the model + * output tensor to be its output. + * + * e.g.) + * + * ((Const Output)) + * + * becomes + * + * (Const) -> [Permute] -> ((Output)) + * + * Note that this is a mandatory pass for Graph. 
+ */ +class ConstantOutputPass : public OperandPass +{ +public: + using OperandPass::OperandPass; + +public: + std::string id() final { return "ConstantOutputPass"; } + +public: + void callback(const ir::OperandIndex &i, ir::Operand &o) final; +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc new file mode 100644 index 000000000..f50fae0d3 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OddOutputPass.h" + +#include "ir/Graph.h" +#include "ir/operation/Permute.h" +#include "util/logging.h" +#include "util/Utils.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +void OddOutputPass::run() +{ + auto &outputs = _graph.getOutputs(); + + VERBOSE(OddOutputPass) << "Case 1 : An operand which is a model output and a model input" + << std::endl; + for (auto &ind : outputs) + { + if (_graph.getInputs().contains(ind)) + { + auto permute_output_ind = insertPermute(ind); + // Update the output to be the newly added operand + _graph.getOutputs().replace(ind, permute_output_ind); + } + } + + VERBOSE(OddOutputPass) << "Case 2 : Two or more duplicated outputs" << std::endl; + std::unordered_set<ir::OperandIndex> occurrence; + for (auto &ind : outputs) + { + auto &obj = _graph.operands().at(ind); + if (occurrence.count(ind) == 0) + { + occurrence.insert(ind); + continue; + } + + // Panic when it is const; it must have been handled earlier in another pass + UNUSED_RELEASE(obj); + assert(!obj.isConstant()); + + auto permute_output_ind = insertPermute(ind); + ind = permute_output_ind; // Replace output index to fix output duplication + } +} + +ir::OperandIndex OddOutputPass::insertPermute(ir::OperandIndex ind) +{ + auto &obj = _graph.operands().at(ind); + auto output_ind = _graph.addOperand(obj.shape(), obj.typeInfo()); + auto &output_obj = _graph.operands().at(output_ind); + + using ir::operation::Permute; + auto permute_obj = std::make_unique<Permute>(ind, output_ind, Permute::Type::COPY); + auto permute_ind = _graph.operations().push(std::move(permute_obj)); + + output_obj.setDef(permute_ind); + obj.insertUse(permute_ind); + + VERBOSE(OddOutputPass) << "Permute Op inserted for an odd output, node index : " + << permute_ind << std::endl; + VERBOSE(OddOutputPass) << " - Input (original) Operand : " << ind << std::endl; + VERBOSE(OddOutputPass) << " - Output(inserted) Operand : " << output_ind << std::endl; + + return output_ind; +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.h b/runtime/onert/core/src/compiler/pass/OddOutputPass.h new file mode 100644 index 000000000..2accbac60 --- /dev/null +++ 
b/runtime/onert/core/src/compiler/pass/OddOutputPass.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ +#define __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ + +#include <unordered_set> + +#include "Pass.h" +#include "ir/Index.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Pass to specially handle odd outputs in a subgraph + * + * Runtime Graph IR requires every input or output must have distinct tensor index, this is onert's + * restriction. However we allow duplication of indices in the models(or API). So we should + * transform the graph after model-loading. + * + * This is necessary since our API lets users to set different buffers for each input and output so + * it is unavoidable that we must copy the value at runtime. + * + * Note that this is a mandatory pass for Graph. + * + * Case 1 : An operand which is a model output and a model input + * + * Create an operand and insert a Permute(copy) op between them. And change the output to be the + * newly generated operand. + * + * e.g.) + * + * ``` + * ((#0 Input0 and also Output0)) + * becomes + * ((#0 Input0)) -> [#0 Permute] -> ((#1 Output0)) + * ``` + * + * Case 2 : Two or more duplicated outputs + * + * Do the same with Case 1, but between two outputs of the same tensor index. + * + * e.g.) + * + * ``` + * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0 and also Output1)) + * becomes + * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0)) [#1 Permute] -> ((#2 Output1)) + * ``` + * + */ +class OddOutputPass : public Pass +{ +public: + using Pass::Pass; + +public: + std::string id() final { return "OddOutputPass"; } + +public: + void run() override; + +private: + ir::OperandIndex insertPermute(ir::OperandIndex input); +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc new file mode 100644 index 000000000..2a058c8ac --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "PassRunner.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +PassRunner &PassRunner::append(std::unique_ptr<Pass> pass) +{ + _passes.emplace_back(std::move(pass)); + return *this; +} + +void PassRunner::run() +{ + for (auto &pass : _passes) + { + VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl; + pass->run(); + VERBOSE(PassRunner) << "Finished running '" << pass->id() << "'" << std::endl; + // TODO Dump graph(LowerInfo, OpSequence, ...)? + } +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.h b/runtime/onert/core/src/compiler/pass/PassRunner.h new file mode 100644 index 000000000..a43c83f89 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/PassRunner.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_PASS_RUNNER_H__ +#define __ONERT_COMPILER_PASS_PASS_RUNNER_H__ + +#include <initializer_list> +#include <memory> +#include <vector> + +#include "Pass.h" +#include "util/logging.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Composite passes with logging + */ +class PassRunner +{ +public: + PassRunner() = default; + PassRunner &append(std::unique_ptr<Pass> pass); + + void run(); + +private: + std::vector<std::unique_ptr<Pass>> _passes; +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_PASS_RUNNER_H__ diff --git a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc index f01697034..504f1b995 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc @@ -53,6 +53,20 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node) if (_graph.getOutputs().contains(out_operand)) { + // If the input is a const, we cannot remove it since we cannot put the constant data in the + // output buffer during prepare phase. + auto permute_input = node.getInputs().at(0); + if (_graph.operands().at(permute_input).isConstant()) + return; + // If the input is a model input, we cannot remove it since our API lets users to set different + // buffers for inputs and outputs even though one tensor is both at the same time. + auto permute_output = node.getOutputs().at(0); + if (_graph.getInputs().contains(permute_input) && _graph.getOutputs().contains(permute_output)) + return; + // Likewise, if copying between outputs to outputs, keep it. 
+ if (_graph.getOutputs().contains(permute_input) && _graph.getOutputs().contains(permute_output)) + return; + // Exceptional case : When the output operand is a model output // In this case we keep the output and remove the input diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc index c5c95c726..93d125307 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc @@ -212,7 +212,7 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node) } } - for (const auto &output : node.getOutputs() | Remove::DUPLICATED) + for (const auto &output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { auto lower_info = _lowered_graph.getLowerInfo(output); lower_info->removeDefPermuteFactor(removed_factor); @@ -279,6 +279,18 @@ void PermutationOperationPass::visit(const ir::operation::Gather &node) } } +void PermutationOperationPass::visit(const ir::operation::OneHot &node) +{ + const auto &output_ind = node.getOutputs().at(0); + const auto &output_obj = _graph.operands().at(output_ind); + const auto &output_shape = output_obj.shape(); + + if (output_shape.rank() >= 4) + { + changeToKeepLayout(node); + } +} + void PermutationOperationPass::visit(const ir::operation::Pack &node) { const auto &input_ind = node.getInputs().at(ir::operation::Reshape::Input::INPUT); diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h index 2dd76b971..cea5de288 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h +++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h @@ -44,6 +44,7 @@ public: void visit(const ir::operation::Concat &) final; void visit(const ir::operation::ElementwiseBinary &) final; void visit(const ir::operation::ElementwiseUnary &) final; + void visit(const ir::operation::OneHot &) final; void visit(const ir::operation::Pack &) final; void visit(const ir::operation::PReLU &) final; void visit(const ir::operation::SquaredDifference &) final; diff --git a/runtime/onert/core/src/dumper/dot/DotDumper.cc b/runtime/onert/core/src/dumper/dot/DotDumper.cc index 118057f09..8f3cf328c 100644 --- a/runtime/onert/core/src/dumper/dot/DotDumper.cc +++ b/runtime/onert/core/src/dumper/dot/DotDumper.cc @@ -81,11 +81,8 @@ void DotDumper::dump(const std::string &tag) } else { - showing_cond = !object.isConstant(); - } - if (object.isConstant() || _graph.getInputs().contains(index)) - { - showing_cond = showing_cond && (object.getUses().size() > 0); + showing_cond = + !object.isConstant() || (_graph.getInputs() + _graph.getOutputs()).contains(index); } if (showing_cond) { diff --git a/runtime/onert/core/src/exec/DataflowExecutor.cc b/runtime/onert/core/src/exec/DataflowExecutor.cc index a69ae9cdb..53bc3c204 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.cc +++ b/runtime/onert/core/src/exec/DataflowExecutor.cc @@ -77,14 +77,12 @@ bool DataflowExecutor::noWaitingJobs() [](const std::unique_ptr<Job> &job) { return job == nullptr; }); } -DataflowExecutor::DataflowExecutor( - std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs, - compiler::CodeMap 
&&code_map) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs)}, +DataflowExecutor::DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, + compiler::CodeMap &&code_map) + : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs}, _code_map{std::move(code_map)} { VERBOSE(DataflowExecutor) << "Constructing Dataflow Executor" << std::endl; @@ -161,6 +159,8 @@ void DataflowExecutor::executeImpl() _subject.notifyJobBegin(this, op_seq, backend); + job->fn_seq()->initRunning(); + // check if FunctionSequence needs to handle dynamic tensor bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists; job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor); diff --git a/runtime/onert/core/src/exec/DataflowExecutor.h b/runtime/onert/core/src/exec/DataflowExecutor.h index 8d60e3e4b..69dfda15c 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.h +++ b/runtime/onert/core/src/exec/DataflowExecutor.h @@ -50,10 +50,9 @@ public: * @param code_map OpSequence and its code map */ DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); void executeImpl() override; diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc index 70bddfce4..0f604c43f 100644 --- a/runtime/onert/core/src/exec/DynamicShapeInference.cc +++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc @@ -23,14 +23,6 @@ namespace onert namespace exec { -inline backend::IDynamicTensorManager * -dynamicTensorManagerOf(const std::shared_ptr<backend::ITensor> &tensor) -{ - if (!tensor->dynamic_tensor_manager()) - throw std::runtime_error{"Dynamic Tensor Manager is not available for this tensor."}; - return tensor->dynamic_tensor_manager(); -} - void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op, const ir::OperandIndex lhs_idx, const ir::OperandIndex rhs_idx) @@ -64,7 +56,7 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op, ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape); - dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -96,30 +88,32 @@ void DynamicShapeInferer::handleSimpleUnaryOp(const ir::Operation &op, auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } void DynamicShapeInferer::visit(const ir::operation::ArgMax &op) { const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; - const auto &input = _tensor_registry->getITensor(input_idx); - auto input_shape = input->getShape(); + const auto input = 
_tensor_registry->getITensor(input_idx); + + const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto axis = _tensor_registry->getITensor(axis_idx); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); if (!input->is_dynamic()) return; + auto input_shape = input->getShape(); + auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer()); const auto rank = input_shape.rank(); - const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis); - - assert(0 <= axis && axis < rank); - - auto output_ind = op.getOutputs().at(0); - auto output = _tensor_registry->getITensor(output_ind); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; - ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis, rank); + ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis_value, rank); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -141,7 +135,68 @@ void DynamicShapeInferer::visit(const ir::operation::BatchMatMul &op) // TODO auto new_shape = shape_inference::inferBatchMatMulShape(lhs_shape, rhs_shape, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_index, new_shape); + output->applyShape(new_shape); +} + +void DynamicShapeInferer::visit(const ir::operation::BCQFullyConnected &op) +{ + const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto &input = _tensor_registry->getITensor(input_idx); + + const auto cluster_idx{ + op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + const auto &cluster = _tensor_registry->getITensor(cluster_idx); + assert(cluster->is_constant()); + + if (!input->is_dynamic()) + return; + + auto input_shape = input->getShape(); + auto cluster_shape = cluster->getShape(); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer()); + assert(cluster_buf); + + ir::Shape new_shape = + shape_inference::inferBCQFullyConnectedShape(input_shape, cluster_shape, cluster_buf); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); + + output->applyShape(new_shape); + assert(output->buffer() != nullptr); +} + +void DynamicShapeInferer::visit(const ir::operation::BCQGather &op) +{ + const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto &indices = _tensor_registry->getITensor(indices_idx); + + const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto &input_binary = _tensor_registry->getITensor(input_binary_idx); + + const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + const auto &cluster = _tensor_registry->getITensor(cluster_idx); + assert(cluster->is_constant()); + + if (!indices->is_dynamic()) + return; + + auto indices_shape = indices->getShape(); + auto cluster_shape = cluster->getShape(); + auto rank = input_binary->getShape().rank(); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer()); + assert(cluster_buf); + + ir::Shape new_shape = shape_inference::inferBCQGatherShape(indices_shape, cluster_shape, + cluster_buf, rank, op.param()); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); + + output->applyShape(new_shape); + assert(output->buffer() != nullptr); } void 
DynamicShapeInferer::visit(const ir::operation::BinaryArithmetic &op) @@ -170,7 +225,7 @@ void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op) shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer())); // set output shape and output buffer - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -236,7 +291,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op) for (auto input_ind : op.getInputs()) { auto input = _tensor_registry->getITensor(input_ind); - if (input != first_input && !isConcatible(first_input.get(), input.get(), op.param().axis)) + if (input != first_input && !isConcatible(first_input, input, op.param().axis)) throw std::runtime_error("input shapes do not match for concat"); } } @@ -255,7 +310,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op) auto output = _tensor_registry->getITensor(output_ind); auto output_shape = shape_inference::inferConcatShape(in_shapes, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } void DynamicShapeInferer::visit(const ir::operation::Conv2D &op) @@ -278,7 +333,7 @@ void DynamicShapeInferer::visit(const ir::operation::Conv2D &op) ir::Shape output_shape = shape_inference::inferConv2DShape(input_shape, ker_shape, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -338,7 +393,7 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op) auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_buf[0]); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -354,14 +409,14 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op) if ((!input->is_dynamic()) && (!output->is_dynamic())) return; - assert(input.get()->data_type() == ir::DataType::INT32); + assert(input->data_type() == ir::DataType::INT32); auto input_buf = reinterpret_cast<const int32_t *>(input->buffer()); assert(input_buf); auto output_shape = shape_inference::inferFillShape(input_shape, input_buf); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -384,7 +439,7 @@ void DynamicShapeInferer::visit(const ir::operation::FullyConnected &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -416,7 +471,7 @@ void DynamicShapeInferer::visit(const ir::operation::Gather &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -425,6 +480,109 @@ void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); } +void DynamicShapeInferer::visit(const ir::operation::LSTM &op) +{ + const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + auto output = _tensor_registry->getITensor(output_index); + + const auto output_state_out_index{
op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + + const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + + if (!output->is_dynamic() && + !(_tensor_registry->getITensor(output_state_out_index) != nullptr && + _tensor_registry->getITensor(output_state_out_index)->is_dynamic()) && + !(_tensor_registry->getITensor(cell_state_out_index) != nullptr && + _tensor_registry->getITensor(cell_state_out_index)->is_dynamic()) && + !(_tensor_registry->getITensor(scratch_buffer_index) != nullptr && + _tensor_registry->getITensor(cell_state_out_index)->is_dynamic())) + return; + + const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input = _tensor_registry->getITensor(input_index); + const auto input_shape = input->getShape(); + + const auto input_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto input_to_output_weights = _tensor_registry->getITensor(input_to_output_weights_index); + const auto input_to_output_weights_shape = input_to_output_weights->getShape(); + + const auto recurrent_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_output_weights = + _tensor_registry->getITensor(recurrent_to_output_weights_index); + const auto recurrent_to_output_weights_shape = recurrent_to_output_weights->getShape(); + + // re-sizing outputs + const int n_batch = + (input_shape.rank() == 3 && op.param().time_major) ? input_shape.dim(1) : input_shape.dim(0); + const int n_cell = input_to_output_weights_shape.dim(0); + const int n_output = recurrent_to_output_weights_shape.dim(1); + if (input_shape.rank() == 3) + { + if (op.param().time_major) + output->applyShape(ir::Shape{input_shape.dim(0), n_batch, n_output}); + else + output->applyShape(ir::Shape{n_batch, input_shape.dim(1), n_output}); + } + else + { + assert(input_shape.rank() == 2); + output->applyShape(ir::Shape{n_batch, n_output}); + } + assert(output->buffer() != nullptr); + + auto output_state_out = _tensor_registry->getITensor(output_state_out_index); + if (output_state_out != nullptr) + { + output_state_out->applyShape(ir::Shape{n_batch, n_output}); + assert(output_state_out->buffer() != nullptr); + } + + auto cell_state_out = _tensor_registry->getITensor(cell_state_out_index); + if (cell_state_out != nullptr) + { + cell_state_out->applyShape(ir::Shape{n_batch, n_cell}); + assert(cell_state_out->buffer() != nullptr); + } + + auto scratch_buffer = _tensor_registry->getITensor(scratch_buffer_index); + if (scratch_buffer != nullptr) + { + const auto input_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + + const auto input_to_input_weights_shape = + _tensor_registry->getITensor(input_to_input_weights_index)->getShape(); + bool has_input_to_input_weights = + input_to_input_weights_shape.dim(0) != 0 && input_to_input_weights_shape.dim(1) != 0; + + const auto recurrent_to_input_weights_shape = + _tensor_registry->getITensor(recurrent_to_input_weights_index)->getShape(); + bool has_recurrent_to_input_weights = recurrent_to_input_weights_shape.dim(0) != 0 && + recurrent_to_input_weights_shape.dim(1) != 0; + + // NOTE The cell_to_input_weights do 
not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + if (has_cifg_param) + { + scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 4}); + } + else + { + scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 3}); + } + assert(scratch_buffer->buffer() != nullptr); + } +} + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); @@ -452,7 +610,7 @@ void DynamicShapeInferer::visit(const ir::operation::OneHot &op) const auto axis_val = op.param().axis; ir::Shape new_shape = shape_inference::inferOnehotShape(indices_shape, *depth_buf, axis_val); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -488,7 +646,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pack &op) ir::Shape new_shape = shape_inference::inferPackShape(input_shape, axis, rank, num); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -515,7 +673,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pad &op) shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements()); // change output shape and reallocate output tensor memory - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -567,7 +725,7 @@ void DynamicShapeInferer::visit(const ir::operation::Range &op) *reinterpret_cast<int32_t *>(limit_tensor->buffer()), *reinterpret_cast<int32_t *>(delta_tensor->buffer())); } - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -611,7 +769,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reduce &op) ir::Shape new_shape = shape_inference::inferReduceShape(input_shape, axes_vec, keep_dims); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -665,7 +823,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op) if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -681,7 +839,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op) if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -705,14 +863,35 @@ void DynamicShapeInferer::visit(const ir::operation::ResizeBilinear &op) return; // getting output shape from input shape and Params - auto output_shape = shape_inference::inferResizeBilinearShape( - input->getShape(), op.param().height_out, op.param().width_out); + int32_t height_out, width_out; + if (op.getInputs().size() == 2) + { + auto size_ind = op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE); + auto size = _tensor_registry->getITensor(size_ind); + if (size->data_type() == ir::DataType::INT32) + { + auto size_buf = reinterpret_cast<const int32_t *>(size->buffer()); + 
height_out = size_buf[0]; + width_out = size_buf[1]; + } + else + { + throw std::runtime_error("DynamicShapeInferer ResizeBilinear : Unsupported data type"); + } + } + else + { + height_out = op.param().height_out; + width_out = op.param().width_out; + } + auto output_shape = + shape_inference::inferResizeBilinearShape(input->getShape(), height_out, width_out); // if shape is changed, change output shape and reallocate output tensor memory if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -749,7 +928,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -768,7 +947,7 @@ void DynamicShapeInferer::visit(const ir::operation::Shape &op) ir::Shape output_shape; output_shape.append(input_shape.rank()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -794,7 +973,7 @@ void DynamicShapeInferer::visit(const ir::operation::Slice &op) ir::Shape new_shape = shape_inference::inferSliceShape(input_shape, begins_buf, sizes_buf); - dynamicTensorManagerOf(output)->applyShape(output_index, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -831,7 +1010,7 @@ void DynamicShapeInferer::visit(const ir::operation::SpaceToBatchND &op) ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape( input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data); - dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -840,27 +1019,37 @@ void DynamicShapeInferer::visit(const ir::operation::Split &op) const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)}; const auto &input = _tensor_registry->getITensor(input_idx); - if (!input->is_dynamic()) + // Return if all tensors are not dynamic + bool has_dynamic = false; + for (const auto output_idx : op.getOutputs()) + { + auto output = _tensor_registry->getITensor(output_idx); + has_dynamic |= output->is_dynamic(); + } + if (!input->is_dynamic() && !has_dynamic) { return; } auto input_shape = input->getShape(); - const auto axis = op.param().axis; + const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)}; + const auto &axis = _tensor_registry->getITensor(axis_idx); + + auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer()); const auto num_splits = op.param().num_splits; const auto rank = input_shape.rank(); - auto axis_resolved = axis < 0 ? axis + rank : axis; + axis_value = axis_value < 0 ? 
axis_value + rank : axis_value; - assert(0 <= axis_resolved && axis_resolved < rank); + assert(0 <= axis_value && axis_value < rank); - ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_resolved, num_splits); + ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_value, num_splits); for (int out_tensor_idx = 0; out_tensor_idx < num_splits; out_tensor_idx++) { auto output_ind = op.getOutputs().at(out_tensor_idx); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } } @@ -889,7 +1078,7 @@ void DynamicShapeInferer::visit(const ir::operation::Squeeze &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -930,7 +1119,7 @@ void DynamicShapeInferer::visit(const ir::operation::StridedSlice &op) ir::Shape output_shape = onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank); - dynamicTensorManagerOf(output)->applyShape(output_index, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -952,10 +1141,11 @@ void DynamicShapeInferer::visit(const ir::operation::Tile &op) auto multiplier_buffer = reinterpret_cast<const int32_t *>(multiplier->buffer()); assert(multiplier_buffer); - auto output_shape = shape_inference::inferTileShape(input_shape, multiplier_buffer); + auto output_shape = + shape_inference::inferTileShape(input_shape, multiplier_buffer, multiplier->dimension(0)); // set output shape and output buffer - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -967,17 +1157,48 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op) // from op, access the buffer of second input to read new shape auto input_ind = op.getInputs().at(ir::operation::Transpose::Input::INPUT); - auto input_tensor = _tensor_registry->getITensor(input_ind); - auto input_shape = input_tensor->getShape(); + auto input = _tensor_registry->getITensor(input_ind); + auto input_shape = input->getShape(); - if (!input_tensor->is_dynamic()) + /* + Here, the state after compilation (static shape inference) could be one of the following: + + input perms output execution-time shape inf required + ------------------------------------ -------------------------------- + case 1) static const static X + case 2) static non-const dynamic O + case 3) dynamic const dynamic O + case 4) dynamic non-const dynamic O + + So, only when both input1 and ouput are static, we can skip dynamic shape inference. 
+ */ + if ((!input->is_dynamic()) && (!output->is_dynamic())) return; - const auto perm{op.param().perm}; - // set output shape, based on input and params - ir::Shape new_shape = shape_inference::inferTransposeShape(input_shape, perm); + auto perm_ind = op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION); + auto perm = _tensor_registry->getITensor(perm_ind); + + ir::Shape new_shape; + // TODO Change perm->dimension(0) == 0 to perm->num_elements() == 0 + if (perm->dimension(0) == 0) // This condition means that perm is (n-1...0) + { + // Call by (n-1...0) + new_shape = shape_inference::inferTransposeShape(input_shape, nullptr, 0); + } + else + { + // Check rank + if (input->num_dimensions() != perm->getShape().num_elements()) + { + throw std::runtime_error("DynamicShapeInferer failed, bad rank size: " + + std::to_string(perm->getShape().num_elements())); + } - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + // set output shape, based on input and params + const auto perm_buffer = reinterpret_cast<const int32_t *>(perm->buffer()); + new_shape = shape_inference::inferTransposeShape(input_shape, perm_buffer, perm->dimension(0)); + } + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -1005,7 +1226,7 @@ void DynamicShapeInferer::visit(const ir::operation::Unpack &op) auto output_ind = op.getOutputs().at(out_tensor_idx); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc index 7feb3ab68..21fdd9c05 100644 --- a/runtime/onert/core/src/exec/Execution.cc +++ b/runtime/onert/core/src/exec/Execution.cc @@ -34,14 +34,13 @@ Execution::Execution(const std::shared_ptr<ExecutorMap> &executors) : _executors void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape) { - // This should be called BEFORE setInput. 
- if (_io_desc.inputs.at(index.value()) != 0) - throw std::runtime_error("Error in calling order"); - // This will be used later to set input tensor dynamic // Note that 'compiled' model will not be updated with new_shape // but new_shape will change model input shape while 'running' the model _io_desc.dynamic_input_shapes[index] = new_shape; + + VERBOSE(Execution) << "Model input shape will be changed at the start of execute()" + << "(index: " << index.value() << ")" << std::endl; } // TODO Remove default parameter diff --git a/runtime/onert/core/src/exec/ExecutionObservers.cc b/runtime/onert/core/src/exec/ExecutionObservers.cc index 060f874de..5883d9a1c 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.cc +++ b/runtime/onert/core/src/exec/ExecutionObservers.cc @@ -22,6 +22,7 @@ #include "exec/IExecutor.h" #include "misc/polymorphic_downcast.h" #include "ir/OpSequence.h" +#include "util/EventWriter.h" namespace onert { @@ -70,7 +71,7 @@ void ProfileObserver::handleEnd(IExecutor *exec, const ir::OpSequence *op_seq, }; ChromeTracingObserver::ChromeTracingObserver(const std::string &filepath, const ir::Graph &graph) - : _ofs{filepath, std::ofstream::out}, _recorder{}, _collector{&_recorder}, _graph{graph} + : _base_filepath(filepath), _recorder{}, _collector{&_recorder}, _graph{graph} { } @@ -78,7 +79,7 @@ ChromeTracingObserver::~ChromeTracingObserver() { try { - _recorder.writeToFile(_ofs); + EventWriter{_recorder}.writeToFiles(_base_filepath); } catch (const std::exception &e) { diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h index ac0076ed2..f8c2acca5 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.h +++ b/runtime/onert/core/src/exec/ExecutionObservers.h @@ -76,7 +76,7 @@ private: static std::string opSequenceTag(const ir::OpSequence *op_seq, const ir::Operations &operations); private: - std::ofstream _ofs; + const std::string &_base_filepath; EventRecorder _recorder; EventCollector _collector; const ir::Graph &_graph; diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index f835a9675..018a0bba0 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -27,38 +27,32 @@ namespace exec { ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs) + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs) : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()}, - _input_tensors{input_tensors}, _output_tensors{output_tensors}, - _tensor_mgrs{std::move(tensor_mgrs)}, _mutex() + _input_tensors{input_tensors}, _output_tensors{output_tensors}, _mutex() { // TODO Fix the way of knowing whether it is primary or not bool primary_executor = !(_input_tensors.empty() && _output_tensors.empty()); if (!primary_executor) { auto build_input_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector<std::shared_ptr<backend::ITensor>> list; + std::vector<backend::ITensor *> list; for (auto ind : ind_seq) { - std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind); + backend::ITensor *tensor = 
tensor_regs.getITensor(ind); assert(tensor != nullptr); - DynAllocInfo dyn_alloc_info{ind}; - _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); list.push_back(tensor); } return list; }; auto build_output_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector<std::shared_ptr<backend::ITensor>> list; + std::vector<backend::ITensor *> list; for (auto ind : ind_seq) { - std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind); + backend::ITensor *tensor = tensor_regs.getITensor(ind); assert(tensor != nullptr); - DynAllocInfo dyn_alloc_info{ind}; - _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); list.push_back(tensor); } return list; @@ -66,28 +60,9 @@ ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_gra _input_tensors = build_input_tensor_list(_graph.getInputs()); _output_tensors = build_output_tensor_list(_graph.getOutputs()); } - else - { - assert(input_tensors.size() == _graph.getInputs().size()); - assert(output_tensors.size() == _graph.getOutputs().size()); - for (uint32_t i = 0; i < input_tensors.size(); i++) - { - auto tensor = input_tensors[i]; - auto ind = _graph.getInputs().at(i); - DynAllocInfo dyn_alloc_info{ind}; - _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); - } - for (uint32_t i = 0; i < output_tensors.size(); i++) - { - auto tensor = output_tensors[i]; - auto ind = _graph.getOutputs().at(i); - DynAllocInfo dyn_alloc_info{ind}; - _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); - } - } } -void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors, +void ExecutorBase::execute(const std::vector<backend::ITensor *> &src_tensors, const std::shared_ptr<IPermuteFunction> &pre_fn) { // For thread-safe, use mutex @@ -108,22 +83,12 @@ void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>> // If src_tensor or input_tensor is nullptr, pre_fn does not copy the tensors if (src_tensor != nullptr && input_tensor != nullptr) { - auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[n]); const auto orig_input_shape = input_tensor->getShape(); const auto changed_input_shape = convertShape(src_tensor->getShape(), src_tensor->layout(), input_tensor->layout()); if (orig_input_shape != changed_input_shape) { - if (dyn_alloc_info == _input_to_dyn_alloc_info.end()) - { - // The input_tensor is a dynamic tensor of backend that doesn't support dynamic tensor - throw std::runtime_error("Unknown dim is found at execution time for a backend that " - "does not support dynamic tensor"); - } - else - { - input_tensor->set_dynamic(); - } + input_tensor->set_dynamic(); } } } @@ -147,7 +112,7 @@ void ExecutorBase::execute(const IODescription &desc) for (uint32_t i = 0; i < _input_tensors.size(); ++i) { // TODO Remove dynamic_cast - auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_input_tensors[i]); + auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_input_tensors[i]); assert(tensor); auto input_shape = desc.dynamic_input_shapes.find(ir::IOIndex{i}); if (input_shape != desc.dynamic_input_shapes.end()) @@ -155,6 +120,7 @@ void ExecutorBase::execute(const IODescription &desc) tensor->set_dynamic(); tensor->setShape(input_shape->second); } + // TODO Check if (desc.inputs[i] == nullptr) // TODO Better design for ITensor? 
(we need const_cast as ITensor is writable) tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.inputs[i]->buffer)), desc.inputs[i]->size); @@ -166,12 +132,12 @@ void ExecutorBase::execute(const IODescription &desc) for (uint32_t i = 0; i < _output_tensors.size(); ++i) { // TODO Remove dynamic_cast - auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_output_tensors[i]); + auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_output_tensors[i]); assert(tensor); tensor->set_dynamic(); // It can't be resized but shape could change - // TODO Better design for ITensor? (we need const_cast as ITensor is writable) - tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.outputs[i]->buffer)), - desc.outputs[i]->size); + if (desc.outputs[i] == nullptr) + throw std::runtime_error{"Output " + std::to_string(i) + "'s buffer is not set."}; + tensor->setBuffer(static_cast<uint8_t *>(desc.outputs[i]->buffer), desc.outputs[i]->size); } executeImpl(); @@ -218,17 +184,8 @@ void ExecutorBase::handleDynamicInputTensor(ir::IOIndex io_ind, const IODescript auto shape_sig_found = desc.dynamic_input_shapes.find(io_ind); if (shape_sig_found != desc.dynamic_input_shapes.end()) { - auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[io_ind.value()]); - if (dyn_alloc_info == _input_to_dyn_alloc_info.end()) - throw std::runtime_error("Unknown dim is found at execution time for a backend that " - "does not support dynamic tensor"); - auto changed_input_shape = shape_sig_found->second; - auto operand_ind = dyn_alloc_info->second.ind; - - auto dyn_tensor_manager = _input_tensors[io_ind.value()]->dynamic_tensor_manager(); - assert(dyn_tensor_manager); - dyn_tensor_manager->applyShape(operand_ind, changed_input_shape); + _input_tensors[io_ind.value()]->applyShape(changed_input_shape); } } diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h index a13be7dbf..8a6ec9174 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.h +++ b/runtime/onert/core/src/exec/ExecutorBase.h @@ -20,9 +20,7 @@ #include <mutex> #include "IPermuteFunction.h" -#include "Source.h" #include "exec/ExecutionObservers.h" -#include "Sink.h" #include "ShapeConverter.h" #include "exec/IExecutor.h" #include "compiler/LoweredGraph.h" @@ -51,10 +49,9 @@ public: * @param tensor_builders Tensor builders that are currently used */ ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs); virtual ~ExecutorBase() = default; @@ -66,7 +63,7 @@ public: * @param src_tensor Tensor list that will be copied to input tensors of this * @param pre_fn The permutation function that copy from src_tensor to input tensors of this */ - void execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors, + void execute(const std::vector<backend::ITensor *> &src_tensors, const std::shared_ptr<IPermuteFunction> &pre_fn); void execute(const IODescription &desc) final; @@ -81,17 +78,9 @@ public: void addObserver(std::unique_ptr<IExecutionObserver> ref) { _subject.add(std::move(ref)); }; - const std::vector<std::shared_ptr<backend::ITensor>> 
&getInputTensors() const - { - return _input_tensors; - } - - const std::vector<std::shared_ptr<backend::ITensor>> &getOutputTensors() const - { - return _output_tensors; - } + const std::vector<backend::ITensor *> &getInputTensors() const { return _input_tensors; } - const DynAllocInfoMap &getInputsDynamicAllocInfo() const { return _input_to_dyn_alloc_info; } + const std::vector<backend::ITensor *> &getOutputTensors() const { return _output_tensors; } protected: /** @@ -104,11 +93,8 @@ protected: std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks; std::unique_ptr<compiler::LoweredGraph> _lowered_graph; const ir::Graph &_graph; - std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; - DynAllocInfoMap _input_to_dyn_alloc_info; - DynAllocInfoMap _output_to_dyn_alloc_info; - backend::TensorManagerSet _tensor_mgrs; + std::vector<backend::ITensor *> _input_tensors; + std::vector<backend::ITensor *> _output_tensors; std::mutex _mutex; private: diff --git a/runtime/onert/core/src/exec/FunctionSequence.cc b/runtime/onert/core/src/exec/FunctionSequence.cc index fb31f7582..8aefa5eeb 100644 --- a/runtime/onert/core/src/exec/FunctionSequence.cc +++ b/runtime/onert/core/src/exec/FunctionSequence.cc @@ -28,9 +28,11 @@ namespace exec void FunctionSequence::run() { - // TODO Find out when `_enable_dynamic_shape_inferer` is true but `_dynamic_tensor_ctx` is false if (_enable_dynamic_shape_inferer && _dynamic_tensor_ctx) { + // acl_cl and acl_neon backend don't support dynamic shape. + // _dynamic_tensor_ctx is always nullptr for acl_cl and acl_neon + // Thus, those two bakends cannot reach here. if (_dynamic_tensor_ctx->op_seq->size() != _functions.size()) throw std::runtime_error("operation and functions should be mapped one by one"); @@ -61,11 +63,6 @@ void FunctionSequence::run() { for (const auto &function : _functions) { - auto *sub_func_seq = dynamic_cast<FunctionSequence *>(function.get()); - if (sub_func_seq != nullptr) - { - sub_func_seq->enableDynamicShapeInferer(false); - } function->run(); } } diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h index 6b4d15380..94bc2e436 100644 --- a/runtime/onert/core/src/exec/IPermuteFunction.h +++ b/runtime/onert/core/src/exec/IPermuteFunction.h @@ -50,13 +50,13 @@ private: public: virtual void run() override { - assert(_src_tensors.size() > 0); + // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) assert(_src_tensors.size() == _dst_tensors.size()); auto src_it = _src_tensors.begin(); auto dst_it = _dst_tensors.begin(); while (src_it != _src_tensors.end()) { - const auto src_tensor = *src_it; + auto src_tensor = *src_it; auto dst_tensor = *dst_it; if (src_tensor != dst_tensor) { @@ -101,9 +101,8 @@ public: virtual void optimize() = 0; private: - template <class T> - void permute(const std::shared_ptr<backend::ITensor> &src, std::shared_ptr<backend::ITensor> &dst, - size_t rank) + // TODO make src const by proving const access() + template <class T> void permute(backend::ITensor *src, backend::ITensor *dst, size_t rank) { const auto permute_type = [&]() -> PermuteType { if (src->layout() == ir::Layout::NHWC && dst->layout() == ir::Layout::NCHW) @@ -121,127 +120,65 @@ private: }(); auto fn = [&](backend::ITensor &src_tensor) { dst->access([&](backend::ITensor &dst_tensor) { - auto src_buffer = src_tensor.buffer(); - auto src_size = src_tensor.total_size(); - auto dst_buffer = dst_tensor.buffer(); - if (permute_type == PermuteType::COPY) + if (rank == 4 && permute_type != PermuteType::COPY) { - assert(src_tensor.layout() == dst_tensor.layout()); - if (!src_tensor.has_padding() && !dst_tensor.has_padding()) + switch (permute_type) { - assert(src_size <= dst_tensor.total_size()); - memcpy(dst_buffer, src_buffer, src_size); - return; - } - } - switch (rank) - { - case 0: - case 1: - { - const int32_t copy_len = dst_tensor.dimension(0); - - memcpy(dst_buffer, src_buffer, copy_len * sizeof(T)); - break; - } - case 2: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t copy_len = dst_tensor.dimension(1); - - for (int32_t i = 0; i < dim_0; ++i) + case PermuteType::NHWC_TO_NCHW: { - ir::Coordinates coords{i, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); + ir::FeatureShape shape; + shape.N = dst_tensor.dimension(0); + shape.C = dst_tensor.dimension(1); + shape.H = dst_tensor.dimension(2); + shape.W = dst_tensor.dimension(3); + const feature::nhwc::Reader<T> from(&src_tensor); + feature::nchw::View<T> into(&dst_tensor); + feature::iterate(shape) + << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { + const auto value = from.at(batch, row, col, ch); + into.at(batch, ch, row, col) = value; + }; + break; } - break; - } - case 3: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t dim_1 = dst_tensor.dimension(1); - const int32_t copy_len = dst_tensor.dimension(2); - - for (auto i = 0; i < dim_0; ++i) + case PermuteType::NCHW_TO_NHWC: { - for (auto j = 0; j < dim_1; ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); - } + ir::FeatureShape shape; + shape.N = src_tensor.dimension(0); + shape.C = src_tensor.dimension(1); + shape.H = src_tensor.dimension(2); + shape.W = src_tensor.dimension(3); + const feature::nchw::Reader<T> from(&src_tensor); + feature::nhwc::View<T> into(&dst_tensor); + feature::iterate(shape) + << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { + const auto value = from.at(batch, ch, row, col); + into.at(batch, row, col, ch) = value; + }; + break; } - break; - } - case 4: - { - switch (permute_type) + default: { - case PermuteType::NHWC_TO_NCHW: - { - ir::FeatureShape shape; - shape.N = dst_tensor.dimension(0); - shape.C = dst_tensor.dimension(1); - shape.H = dst_tensor.dimension(2); - shape.W = 
dst_tensor.dimension(3); - const feature::nhwc::Reader<T> from(&src_tensor); - feature::nchw::View<T> into(&dst_tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - break; - } - case PermuteType::NCHW_TO_NHWC: - { - ir::FeatureShape shape; - shape.N = src_tensor.dimension(0); - shape.C = src_tensor.dimension(1); - shape.H = src_tensor.dimension(2); - shape.W = src_tensor.dimension(3); - const feature::nchw::Reader<T> from(&src_tensor); - feature::nhwc::View<T> into(&dst_tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - into.at(batch, row, col, ch) = value; - }; - break; - } - case PermuteType::COPY: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t dim_1 = dst_tensor.dimension(1); - const int32_t dim_2 = dst_tensor.dimension(2); - const int32_t copy_len = dst_tensor.dimension(3); - - for (auto i = 0; i < dim_0; ++i) - { - for (auto j = 0; j < dim_1; ++j) - { - for (auto k = 0; k < dim_2; ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); - } - } - } - break; - } - default: - { - throw std::runtime_error("Unsupported Permutation"); - break; - } + throw std::runtime_error("Unsupported Permutation"); + break; } - break; } - default: - throw std::runtime_error("Unsupported rank in permutation"); - break; + } + else if (!src_tensor.has_padding() && !dst_tensor.has_padding()) + { + auto src_size = src_tensor.total_size(); + assert(src_size <= dst_tensor.total_size()); + memcpy(dst_tensor.buffer(), src_tensor.buffer(), src_size); + } + else + { + auto loop_shape = src_tensor.getShape(); + const auto copy_axis = loop_shape.rank() - 1; + const auto copy_len = loop_shape.dim(copy_axis) * sizeof(T); + loop_shape.dim(copy_axis) = 1; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + memcpy(dst_tensor.buffer() + dst_tensor.calcOffset(coords), + src_tensor.buffer() + src_tensor.calcOffset(coords), copy_len); + }); } }); }; @@ -275,8 +212,8 @@ private: } protected: - std::vector<std::shared_ptr<backend::ITensor>> _src_tensors; - std::vector<std::shared_ptr<backend::ITensor>> _dst_tensors; + std::vector<backend::ITensor *> _src_tensors; + std::vector<backend::ITensor *> _dst_tensors; // TODO Remove this member if it is possible std::vector<size_t> _ranks; }; diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc index 69dfe9b9b..6e6ca110f 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.cc +++ b/runtime/onert/core/src/exec/LinearExecutor.cc @@ -51,8 +51,10 @@ void LinearExecutor::executeImpl() _subject.notifyJobBegin(this, op_seq, backend); auto &fn_seq = code.fn_seq; - bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput(); + fn_seq->initRunning(); + + bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput(); fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor); fn_seq->run(); diff --git a/runtime/onert/core/src/exec/LinearExecutor.h b/runtime/onert/core/src/exec/LinearExecutor.h index c224d3f4f..22d00ec30 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.h +++ b/runtime/onert/core/src/exec/LinearExecutor.h @@ -47,13 +47,11 @@ public: * @param code_map OpSequence and its code map */ 
LinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map, const std::vector<ir::OpSequenceIndex> &order) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs)} + : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs} { for (auto index : order) { diff --git a/runtime/onert/core/src/exec/ParallelExecutor.cc b/runtime/onert/core/src/exec/ParallelExecutor.cc index ab234aacd..676bdb5fa 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.cc +++ b/runtime/onert/core/src/exec/ParallelExecutor.cc @@ -59,14 +59,13 @@ void ParallelExecutor::notify(uint32_t finished_job_id) _cv_jobs.notify_all(); } -ParallelExecutor::ParallelExecutor( - std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs, - compiler::CodeMap &&code_map) - : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)} +ParallelExecutor::ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, + compiler::CodeMap &&code_map) + : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, + std::move(code_map)} { VERBOSE(ParallelExecutor) << "Constructing Parallel Executor" << std::endl; } @@ -133,6 +132,8 @@ void ParallelExecutor::executeImpl() notify(job_index); }; + job->fn_seq()->initRunning(); + // dynamic tensor setting bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists; job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor); diff --git a/runtime/onert/core/src/exec/ParallelExecutor.h b/runtime/onert/core/src/exec/ParallelExecutor.h index 929edfce9..111c20c0c 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.h +++ b/runtime/onert/core/src/exec/ParallelExecutor.h @@ -51,10 +51,9 @@ public: * @param code_map OpSequence and its code map */ ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); void executeImpl() override; diff --git a/runtime/onert/core/src/exec/Sink.h b/runtime/onert/core/src/exec/Sink.h deleted file mode 100644 index 6a99efe60..000000000 --- a/runtime/onert/core/src/exec/Sink.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2018 Samsung 
Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_EXEC_SINK_H__ -#define __ONERT_EXEC_SINK_H__ - -#include "feature/nchw/Reader.h" -#include "feature/nchw/View.h" -#include "feature/nhwc/Reader.h" -#include "feature/nhwc/View.h" - -#include <cassert> -#include <memory> -#include "util/Utils.h" -#include <misc/feature/IndexIterator.h> - -namespace onert -{ -namespace exec -{ -struct ISink -{ - virtual ~ISink() = default; - - virtual void pull(::onert::backend::ITensor &tensor) const = 0; -}; - -// Create second lever inheritance: the first lever is used as a reference type in use-case places -template <typename T> class ITemplSink : public ISink -{ -public: - ITemplSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - const bool copy, ir::Layout io_layout) - : _output_buffer{reinterpret_cast<T *>(output_buffer)}, _output_size{output_size}, - _shape{shape}, _copy{copy}, _io_layout{io_layout} - { - } - -protected: - void pullUnif(onert::backend::ITensor &tensor) const - { - assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) || - (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) || - _copy); - auto input_buffer = tensor.buffer(); - auto rank = _shape.rank(); - - if (!tensor.has_padding() && rank < 4 + _copy) - { - memcpy(_output_buffer, input_buffer, _output_size); - return; - } - - switch (rank) - { - case 0: - case 1: - { - memcpy(_output_buffer, input_buffer, _output_size); - break; - } - case 2: - { - const int32_t copy_len = _shape.dim(1); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - ir::Coordinates coords{i, 0}; - memcpy(_output_buffer + i * copy_len, input_buffer + tensor.calcOffset(coords), - copy_len * sizeof(T)); - } - break; - } - case 3: - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(_output_buffer + i * dim1 * dim2 + j * dim2, - input_buffer + tensor.calcOffset(coords), dim2 * sizeof(T)); - } - } - break; - } - case 4: - { - if (_copy) - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - const int32_t dim3 = _shape.dim(3); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - for (auto k = 0; k < _shape.dim(2); ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(_output_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3, - input_buffer + tensor.calcOffset(coords), dim3 * sizeof(T)); - } - } - } - } - else - { - const auto shape = _shape.asFeature(_io_layout); - - if (_io_layout == ir::Layout::NHWC) - { - const exec::feature::nchw::Reader<T> from(&tensor); - exec::feature::nhwc::View<T> into(shape, _output_buffer, _output_size); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - 
into.at(batch, row, col, ch) = value; - }; - } - else if (_io_layout == ir::Layout::NCHW) - { - const exec::feature::nhwc::Reader<T> from(&tensor); - exec::feature::nchw::View<T> into(shape, _output_buffer, _output_size); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - } - else - { - throw std::runtime_error("Wrong Layout"); - } - } - break; - } - default: - throw std::runtime_error("NYI: rank > 4"); - break; - } - } - -private: - T *_output_buffer; - const size_t _output_size; - const ir::Shape _shape; - const bool _copy; - const ir::Layout _io_layout; -}; - -template <typename T> class PermutateSink final : public ITemplSink<T> -{ -public: - PermutateSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - ir::Layout io_layout) - : ITemplSink<T>(output_buffer, output_size, shape, false, io_layout) - { - } - -public: - void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); } -}; - -// Only supports NHWC format front-end(NNAPI) now -template <typename T> class CopySink final : public ITemplSink<T> -{ -public: - CopySink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - ir::Layout io_layout = ir::Layout::UNKNOWN) - : ITemplSink<T>(output_buffer, output_size, shape, true, io_layout) - { - } - -public: - void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); } -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_SINK_H__ diff --git a/runtime/onert/core/src/exec/Source.h b/runtime/onert/core/src/exec/Source.h deleted file mode 100644 index fb2be4dd8..000000000 --- a/runtime/onert/core/src/exec/Source.h +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_EXEC_SOURCE_H__ -#define __ONERT_EXEC_SOURCE_H__ - -#include "feature/IndexIterator.h" -#include "feature/nchw/Reader.h" -#include "feature/nchw/View.h" -#include "feature/nhwc/Reader.h" -#include "feature/nhwc/View.h" - -#include <cassert> -#include <memory> -#include "util/Utils.h" -#include <ir/Layout.h> -#include "ir/Shape.h" - -namespace onert -{ -namespace exec -{ - -struct ISource -{ - virtual ~ISource() = default; - - virtual void push(::onert::backend::ITensor &tensor) const = 0; -}; - -// Create second lever inheritance: the first lever is used as a reference type in use-case places -template <typename T> class ITemplSource : public ISource -{ -public: - ITemplSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - const bool copy, ir::Layout io_layout) - : _input_buffer{reinterpret_cast<const T *>(input_buffer)}, _input_size{input_size}, - _shape{shape}, _copy(copy), _io_layout{io_layout} - { - } - - virtual void push(::onert::backend::ITensor &tensor) const = 0; - -protected: - void pushUnif(onert::backend::ITensor &tensor) const - { - assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) || - (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) || - _copy); - auto output_buffer = tensor.buffer(); - auto rank = _shape.rank(); - - if (!tensor.has_padding() && rank < 4 + _copy) - { - memcpy(output_buffer, _input_buffer, _input_size); - return; - } - - switch (rank) - { - case 0: - case 1: - { - memcpy(output_buffer, _input_buffer, _input_size); - break; - } - case 2: - { - const int32_t copy_len = _shape.dim(1); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - ir::Coordinates coords{i, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), _input_buffer + i * copy_len, - copy_len * sizeof(T)); - } - break; - } - case 3: - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), - _input_buffer + i * dim1 * dim2 + j * dim2, dim2 * sizeof(T)); - } - } - break; - } - case 4: - { - if (_copy) - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - const int32_t dim3 = _shape.dim(3); - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - for (auto k = 0; k < _shape.dim(2); ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), - _input_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3, - dim3 * sizeof(T)); - } - } - } - } - else - { - const auto shape = _shape.asFeature(_io_layout); - - if (_io_layout == ir::Layout::NCHW) - { - const exec::feature::nchw::Reader<T> from(shape, _input_buffer, _input_size); - exec::feature::nhwc::View<T> into(&tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - into.at(batch, row, col, ch) = value; - }; - } - else if (_io_layout == ir::Layout::NHWC) - { - const exec::feature::nhwc::Reader<T> from(shape, _input_buffer, _input_size); - exec::feature::nchw::View<T> into(&tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - } - else - { - throw std::runtime_error("Wrong Layout"); - } - } - - 
break; - } - default: - throw std::runtime_error("NYI: rank > 4"); - break; - } - } - -private: - const T *_input_buffer; - const size_t _input_size; - const ir::Shape _shape; - const bool _copy; - const ir::Layout _io_layout; -}; - -template <typename T> class PermutateSource final : public ITemplSource<T> -{ -public: - PermutateSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - ir::Layout io_layout) - : ITemplSource<T>(input_buffer, input_size, shape, false, io_layout) - { - } - -public: - void push(onert::backend::ITensor &tensor) const override - { - // do NHWC_TO_NCHW or NCHW_TO_NHWC permutation - ITemplSource<T>::pushUnif(tensor); - } -}; - -template <typename T> class CopySource final : public ITemplSource<T> -{ -public: - CopySource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - ir::Layout io_layout = ir::Layout::UNKNOWN) - : ITemplSource<T>(input_buffer, input_size, shape, true, io_layout) - { - } - -public: - void push(onert::backend::ITensor &tensor) const override { ITemplSource<T>::pushUnif(tensor); } -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_SOURCE_H__ diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h index 008a4b9d4..8b72d537d 100644 --- a/runtime/onert/core/src/interp/Tensor.h +++ b/runtime/onert/core/src/interp/Tensor.h @@ -171,7 +171,6 @@ public: int32_t data_offset() const override { return _info.typeInfo().offset(); } const ir::OperandInfo &tensorInfo() const override { return _info; } uint64_t num_elements() const override { return _info.shape().num_elements(); }; - backend::IDynamicTensorManager *dynamic_tensor_manager() override { return nullptr; } private: const ir::OperandInfo _info; diff --git a/runtime/onert/core/src/ir/Graph.cc b/runtime/onert/core/src/ir/Graph.cc index fe8b1b443..605562ebc 100644 --- a/runtime/onert/core/src/ir/Graph.cc +++ b/runtime/onert/core/src/ir/Graph.cc @@ -103,7 +103,7 @@ void Graph::initializeUseDef() { operations().iterate([&](const OperationIndex &index, const Operation &node) -> void { auto outputs = node.getOutputs(); - for (auto output : outputs) + for (auto output : outputs | ir::Remove::UNDEFINED) { operands().at(output).setDef(index); } diff --git a/runtime/onert/core/src/ir/GraphIterator.cc b/runtime/onert/core/src/ir/GraphIterator.cc index 4bea1a55d..ac67771c4 100644 --- a/runtime/onert/core/src/ir/GraphIterator.cc +++ b/runtime/onert/core/src/ir/GraphIterator.cc @@ -53,7 +53,7 @@ void PostDfsIterator<is_const>::iterate(GraphRef graph, const IterFn &fn) const return; visited[index] = true; - for (const auto output : node.getOutputs() | Remove::DUPLICATED) + for (const auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = graph.operands().at(output); for (const auto &use : operand.getUses()) @@ -86,7 +86,7 @@ void PostDfsIterator<is_const>::iterateOpSeqs(LoweredGraphRef lowered_graph, return; visited[index] = true; - for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED) + for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = lowered_graph.graph().operands().at(output); for (const auto &use : operand.getUses()) diff --git a/runtime/onert/core/src/ir/Operation.cc b/runtime/onert/core/src/ir/Operation.cc index 04be8c0d9..4af878541 100644 --- a/runtime/onert/core/src/ir/Operation.cc +++ b/runtime/onert/core/src/ir/Operation.cc @@ -24,22 +24,33 @@ namespace ir { 
Operation::Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs, - const OperandIndexSequence &outputs) - : _input_constr{input_constr}, _inputs{inputs}, _outputs{outputs} + const OperandIndexSequence &outputs, OperandConstraint output_constr) + : _input_constr{input_constr}, _output_constr{output_constr} { + setInputs(inputs); + setOutputs(outputs); } -Operation::Operation(OperandConstraint input_constr) : _input_constr{input_constr} {} +Operation::Operation(OperandConstraint input_constr, OperandConstraint output_constr) + : _input_constr{input_constr}, _output_constr{output_constr} +{ +} Operation::~Operation() = default; void Operation::setInputs(const OperandIndexSequence &indexes) { - assert(_input_constr.check(indexes.size())); + if (!_input_constr.check(indexes.size())) + throw std::runtime_error{"Invalid number of input tensors for this operation."}; _inputs = indexes; } -void Operation::setOutputs(const OperandIndexSequence &indexes) { _outputs = indexes; } +void Operation::setOutputs(const OperandIndexSequence &indexes) +{ + if (!_output_constr.check(indexes.size())) + throw std::runtime_error{"Invalid number of output tensors for this operation."}; + _outputs = indexes; +} void Operation::replaceInputs(const OperandIndex &from, const OperandIndex &to) { diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc index 48361f464..eecfe81cc 100644 --- a/runtime/onert/core/src/ir/OperationDumper.cc +++ b/runtime/onert/core/src/ir/OperationDumper.cc @@ -40,7 +40,7 @@ void dumpUnaryInputOp(const Operation &node, const std::string &adding_input = " void dumpBinaryInputOp(const Operation &node, const std::string &adding_input = "") { VERBOSE(LIR) << "* " << node.name() << std::endl; - VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(0) + VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(1) << ") " << adding_input << std::endl; VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl; } @@ -72,7 +72,7 @@ OperationDumper::OperationDumper(const std::string &start_msg) VERBOSE(LIR) << start_msg << std::endl; } -void OperationDumper::visit(const ArgMax &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const ArgMax &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const BatchToSpaceND &node) { @@ -82,6 +82,20 @@ void OperationDumper::visit(const BatchToSpaceND &node) dumpUnaryInputOp(node, block_size); } +void OperationDumper::visit(const BCQFullyConnected &node) +{ + VERBOSE(LIR) << "* " << node.name() << std::endl; + VERBOSE(LIR) << " - Inputs : IFM(" << node.getInputs().at(BCQFullyConnected::Input::INPUT) + << ") WeightsBinary(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_BINARY) + << ") WeightsScales(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_SCALES) + << ") WeightsClusters(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_CLUSTERS) << ") Bias(" + << node.getInputs().at(BCQFullyConnected::Input::BIAS) << ")" << std::endl; + VERBOSE(LIR) << " - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl; +} + void OperationDumper::visit(const BinaryArithmetic &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const operation::BroadcastTo &node) { dumpBinaryInputOp(node); } @@ -185,6 +199,7 @@ void OperationDumper::visit(const LocalResponseNormalization &node) { dumpUnaryI void OperationDumper::visit(const LSTM &node) { 
+ VERBOSE(LIR) << "* " << node.name() << std::endl; VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(LSTM::Input::INPUT) << ") Input To Input Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_INPUT_WEIGHTS) @@ -209,12 +224,24 @@ void OperationDumper::visit(const LSTM &node) << node.getInputs().at(LSTM::Input::PROJECTION_WEIGHTS) << ") Projection Bias(" << node.getInputs().at(LSTM::Input::PROJECTION_BIAS) << ") Output State In(" << node.getInputs().at(LSTM::Input::OUTPUT_STATE_IN) << ") Cell State In(" - << node.getInputs().at(LSTM::Input::CELL_STATE_IN) << ")" << std::endl; + << node.getInputs().at(LSTM::Input::CELL_STATE_IN); + if (node.getInputs().size() == 24) + { + VERBOSE(LIR) << ") Input Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS) + << ") Forget Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS) + << ") Cell Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS) + << ") Ouput Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS); + } + VERBOSE(LIR) << ")" << std::endl; VERBOSE(LIR) << " - Output : Scratch Buffer(" << node.getOutputs().at(LSTM::Output::SCRATCH_BUFFER) << ") Output State Out(" - << node.getInputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out(" - << node.getInputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output(" - << node.getInputs().at(LSTM::Output::OUTPUT) << ")" << std::endl; + << node.getOutputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out(" + << node.getOutputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output(" + << node.getOutputs().at(LSTM::Output::OUTPUT) << ")" << std::endl; } void OperationDumper::visit(const Pack &node) { dumpPackingOp(node); } @@ -279,7 +306,37 @@ void OperationDumper::visit(const Reshape &node) dumpUnaryInputOp(node, shape); } -void OperationDumper::visit(const ResizeBilinear &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const ResizeBilinear &node) +{ + if (node.getInputs().size() == 1) + { + dumpUnaryInputOp(node); + } + else if (node.getInputs().size() == 2) + { + dumpBinaryInputOp(node); + } + else + { + VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl; + } +} + +void OperationDumper::visit(const ResizeNearestNeighbor &node) +{ + if (node.getInputs().size() == 1) + { + dumpUnaryInputOp(node); + } + else if (node.getInputs().size() == 2) + { + dumpBinaryInputOp(node); + } + else + { + VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl; + } +} void OperationDumper::visit(const Reverse &node) { @@ -336,7 +393,7 @@ void OperationDumper::visit(const SpaceToBatchND &node) void OperationDumper::visit(const SpaceToDepth &node) { dumpUnaryInputOp(node); } -void OperationDumper::visit(const Split &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const Split &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const SquaredDifference &node) { dumpBinaryInputOp(node); } @@ -384,7 +441,7 @@ void OperationDumper::visit(const TransposeConv &node) VERBOSE(LIR) << " - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl; } -void OperationDumper::visit(const Transpose &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const Transpose &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const Unpack &node) { diff --git a/runtime/onert/core/src/ir/OperationDumper.h 
b/runtime/onert/core/src/ir/OperationDumper.h index e8ab3b3cd..91642ab13 100644 --- a/runtime/onert/core/src/ir/OperationDumper.h +++ b/runtime/onert/core/src/ir/OperationDumper.h @@ -33,6 +33,7 @@ public: public: void visit(const operation::ArgMax &) override; void visit(const operation::BatchToSpaceND &node) override; + void visit(const operation::BCQFullyConnected &node) override; void visit(const operation::BinaryArithmetic &node) override; void visit(const operation::BroadcastTo &) override; void visit(const operation::Comparison &) override; @@ -65,6 +66,7 @@ public: void visit(const operation::Reduce &) override; void visit(const operation::Reshape &node) override; void visit(const operation::ResizeBilinear &) override; + void visit(const operation::ResizeNearestNeighbor &) override; void visit(const operation::Reverse &) override; void visit(const operation::RNN &) override; void visit(const operation::Select &node) override; diff --git a/runtime/onert/core/src/ir/operation/ArgMax.cc b/runtime/onert/core/src/ir/operation/ArgMax.cc index 1275ae43a..f3bd8fd73 100644 --- a/runtime/onert/core/src/ir/operation/ArgMax.cc +++ b/runtime/onert/core/src/ir/operation/ArgMax.cc @@ -31,7 +31,7 @@ void ArgMax::accept(OperationVisitor &v) const { v.visit(*this); } ArgMax::ArgMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc index 9ef2b125f..34be79dd2 100644 --- a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc +++ b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc @@ -31,7 +31,7 @@ void BatchToSpaceND::accept(OperationVisitor &v) const { v.visit(*this); } BatchToSpaceND::BatchToSpaceND(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) - : Operation{OperandConstraint::createExact(3u), inputs, outputs} + : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc index 7dfcd4a98..6a0be7eb8 100644 --- a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc +++ b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc @@ -32,7 +32,9 @@ void ElementwiseUnary::accept(OperationVisitor &v) const { v.visit(*this); } ElementwiseUnary::ElementwiseUnary(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(1u), inputs, outputs, + OperandConstraint::createExact(1u)}, + _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/Fill.cc b/runtime/onert/core/src/ir/operation/Fill.cc index c44f45aab..b8b97d1c0 100644 --- a/runtime/onert/core/src/ir/operation/Fill.cc +++ b/runtime/onert/core/src/ir/operation/Fill.cc @@ -30,7 +30,7 @@ namespace operation void Fill::accept(OperationVisitor &v) const { v.visit(*this); } Fill::Fill(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) - : Operation{OperandConstraint::createExact(1u), inputs, outputs} + : Operation{OperandConstraint::createExact(2u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/operation/FullyConnected.cc
b/runtime/onert/core/src/ir/operation/FullyConnected.cc index 118ae554a..9837a3137 100644 --- a/runtime/onert/core/src/ir/operation/FullyConnected.cc +++ b/runtime/onert/core/src/ir/operation/FullyConnected.cc @@ -31,7 +31,7 @@ void FullyConnected::accept(OperationVisitor &v) const { v.visit(*this); } FullyConnected::FullyConnected(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/LSTM.cc b/runtime/onert/core/src/ir/operation/LSTM.cc index 30a865326..5cd7c793a 100644 --- a/runtime/onert/core/src/ir/operation/LSTM.cc +++ b/runtime/onert/core/src/ir/operation/LSTM.cc @@ -31,10 +31,18 @@ void LSTM::accept(OperationVisitor &v) const { v.visit(*this); } LSTM::LSTM(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(23u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(20u, 24u), inputs, outputs}, _param{param} { } +std::string LSTM::name() const +{ + if (getOutputs().at(Output::SCRATCH_BUFFER).undefined()) + return std::string{"UnidirectionalSequenceLSTM"}; + else + return Operation::name(); +} + } // namespace operation } // namespace ir } // namespace onert diff --git a/runtime/onert/core/src/ir/operation/Pack.cc b/runtime/onert/core/src/ir/operation/Pack.cc index f0908a2c6..784d4162a 100644 --- a/runtime/onert/core/src/ir/operation/Pack.cc +++ b/runtime/onert/core/src/ir/operation/Pack.cc @@ -25,7 +25,7 @@ namespace operation void Pack::accept(OperationVisitor &v) const { v.visit(*this); } Pack::Pack(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createAtLeast(3u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param} { } } // namespace operation diff --git a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc index d0d89f45f..71925bb44 100644 --- a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc +++ b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc @@ -31,7 +31,7 @@ void ResizeBilinear::accept(OperationVisitor &v) const { v.visit(*this); } ResizeBilinear::ResizeBilinear(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc index 9f17af97c..98d0b5f26 100644 --- a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc +++ b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc @@ -32,7 +32,7 @@ void ResizeNearestNeighbor::accept(OperationVisitor &v) const { v.visit(*this); ResizeNearestNeighbor::ResizeNearestNeighbor(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/Split.cc
b/runtime/onert/core/src/ir/operation/Split.cc index 244884e41..b538e9206 100644 --- a/runtime/onert/core/src/ir/operation/Split.cc +++ b/runtime/onert/core/src/ir/operation/Split.cc @@ -25,7 +25,7 @@ namespace operation void Split::accept(OperationVisitor &v) const { v.visit(*this); } Split::Split(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} { } } // namespace operation diff --git a/runtime/onert/core/src/ir/operation/Transpose.cc b/runtime/onert/core/src/ir/operation/Transpose.cc index 3a663fbce..997f98ab0 100644 --- a/runtime/onert/core/src/ir/operation/Transpose.cc +++ b/runtime/onert/core/src/ir/operation/Transpose.cc @@ -29,9 +29,8 @@ namespace operation void Transpose::accept(OperationVisitor &v) const { v.visit(*this); } -Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, - const Param &param) - : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} +Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) + : Operation{OperandConstraint::createExact(2u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/verifier/Verifier.cc b/runtime/onert/core/src/ir/verifier/Verifier.cc index 09cbdcf2f..489845971 100644 --- a/runtime/onert/core/src/ir/verifier/Verifier.cc +++ b/runtime/onert/core/src/ir/verifier/Verifier.cc @@ -51,7 +51,7 @@ bool DAGChecker::verify(const Graph &graph) const noexcept visited[index] = true; on_stack[index] = true; - for (auto output : node.getOutputs() | Remove::DUPLICATED) + for (auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = graph.operands().at(output); for (const auto &use : operand.getUses()) @@ -99,7 +99,7 @@ bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept errors += 1; } } - for (auto operand_index : node.getOutputs()) + for (auto operand_index : node.getOutputs() | ir::Remove::UNDEFINED) { try { diff --git a/runtime/onert/core/src/util/EventCollectorGlobal.cc b/runtime/onert/core/src/util/EventCollectorGlobal.cc index d09b95210..6c03a5b9a 100644 --- a/runtime/onert/core/src/util/EventCollectorGlobal.cc +++ b/runtime/onert/core/src/util/EventCollectorGlobal.cc @@ -21,6 +21,7 @@ #include <iostream> #include "util/ConfigSource.h" +#include "util/EventWriter.h" namespace onert { @@ -39,8 +40,8 @@ EventCollectorGlobal::~EventCollectorGlobal() try { // TODO Need better way for saved file path than the hardcoded path - std::ofstream ofs{"trace.global.json"}; - _recorder.writeToFile(ofs); + EventWriter{_recorder}.writeToFile("trace.global.json", + EventWriter::WriteFormat::CHROME_TRACING); } catch (const std::exception &e) { diff --git a/runtime/onert/core/src/util/EventRecorder.cc b/runtime/onert/core/src/util/EventRecorder.cc index 13a599bed..3714e4f02 100644 --- a/runtime/onert/core/src/util/EventRecorder.cc +++ b/runtime/onert/core/src/util/EventRecorder.cc @@ -16,389 +16,6 @@ #include "util/EventRecorder.h" -#include <sstream> -#include <vector> -#include <unordered_map> -#include <json/json.h> -#include <assert.h> -#include <utility> -#include <map> -#include <set> -#include <stdint.h> - -// json type for Chrome Event Trace -namespace -{ - -std::string quote(const std::string &value) -{ - std::stringstream ss; - ss << '"' << value << '"'; - return ss.str(); -} - -std::string
field(const std::string &k, const std::string &v) -{ - std::stringstream ss; - ss << quote(k) << " : " << quote(v); - return ss.str(); -} - -struct Content // One Entry in Chrome Event Trace -{ - std::vector<std::pair<std::string, std::string>> flds; - std::vector<std::pair<std::string, std::string>> args; -}; - -std::string object(const Content &content) -{ - std::stringstream ss; - - ss << "{ "; - - ss << field(content.flds[0].first, content.flds[0].second); - - for (uint32_t n = 1; n < content.flds.size(); ++n) - { - ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second); - } - - if (content.args.size() > 0) - { - ss << ", " << quote("args") << " : { "; - ss << field(content.args.at(0).first, content.args.at(0).second); - - for (uint32_t n = 1; n < content.args.size(); ++n) - { - ss << ", " << field(content.args.at(n).first, content.args.at(n).second); - } - - ss << "}"; - } - - ss << " }"; - - return ss.str(); -} - -void fill(Content &content, const Event &evt) -{ - content.flds.emplace_back("name", evt.name); - content.flds.emplace_back("pid", "0"); - content.flds.emplace_back("tid", evt.tid); - content.flds.emplace_back("ph", evt.ph); - content.flds.emplace_back("ts", evt.ts); -} - -std::string object(const DurationEvent &evt) -{ - Content content; - - fill(content, evt); - - return ::object(content); -} - -std::string object(const CounterEvent &evt) -{ - Content content; - - fill(content, evt); - - for (auto it = evt.values.begin(); it != evt.values.end(); ++it) - { - content.args.emplace_back(it->first, it->second); - } - - return ::object(content); -} - -} // namespace - -// md table type -namespace -{ - -void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list) -{ - os << "| "; - for (auto &key : list) - { - os << key << " | "; - } - os << "\n"; -} - -struct MDContent -{ - std::string name; - uint64_t begin_ts; - uint64_t end_ts; - uint32_t min_rss; - uint32_t max_rss; - uint32_t min_page_reclaims; - uint32_t max_page_reclaims; - - MDContent() - : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX), - max_page_reclaims(0) - { - // DO NOTHING - } - - virtual ~MDContent() = default; - - void updateRss(uint32_t rss) - { - if (min_rss == UINT32_MAX) - min_rss = rss; - if (max_rss == 0) - max_rss = rss; - - if (min_rss > rss) - min_rss = rss; - else if (max_rss < rss) - max_rss = rss; - } - - void updateMinflt(uint32_t minflt) - { - if (min_page_reclaims == UINT32_MAX) - min_page_reclaims = minflt; - if (max_page_reclaims == 0) - max_page_reclaims = minflt; - - if (min_page_reclaims > minflt) - min_page_reclaims = minflt; - else if (max_page_reclaims < minflt) - max_page_reclaims = minflt; - } - - virtual void write(std::ostream &os) const = 0; -}; - -struct OpSeq : public MDContent -{ - std::string backend; - uint64_t graph_latency; - - struct OpSeqCmp - { - bool operator()(const OpSeq &lhs, const OpSeq &rhs) const - { - return lhs.begin_ts < rhs.begin_ts; - } - bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } - bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } - }; - - void write(std::ostream &os) const override - { - uint64_t opseq_latency = end_ts - begin_ts; - double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0; - writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per), - std::to_string(min_rss), std::to_string(max_rss), - std::to_string(min_page_reclaims), 
std::to_string(max_page_reclaims)}); - } -}; - -struct Graph : public MDContent -{ - std::set<OpSeq, OpSeq::OpSeqCmp> opseqs; - - void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq) - { - uint64_t graph_latency = end_ts - begin_ts; - for (auto it : name_to_opseq) - { - auto opseq = it.second; - opseq.graph_latency = graph_latency; - - opseqs.insert(opseq); - - updateRss(opseq.min_rss); - updateRss(opseq.max_rss); - updateMinflt(opseq.min_page_reclaims); - updateMinflt(opseq.max_page_reclaims); - } - } - - void write(std::ostream &os) const override - { - static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)", - "page_reclaims_min", "page_reclaims_max"}; - - static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------", - "-----------------", "-----------------"}; - - // Graph's Header - writeMDTableRow(os, graph_headers); - writeMDTableRow(os, graph_headers_line); - - // Graph's contents - writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss), - std::to_string(max_rss), std::to_string(min_page_reclaims), - std::to_string(max_page_reclaims)}); - - os << "\n"; - - static std::vector<std::string> opseq_headers{ - "OpSeq name", "backend", "latency(us)", "latency(%)", - "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"}; - - static std::vector<std::string> opseq_headers_line{ - "----------", "-------", "-----------", "-----------", - "-------", "-------", "-----------------", "-----------------"}; - - os << "## OpSequences \n"; - - // OpSeq's Header - writeMDTableRow(os, opseq_headers); - writeMDTableRow(os, opseq_headers_line); - - // OpSeq's contents - for (auto opseq : opseqs) - { - opseq.write(os); - } - - os << "\n"; - } -}; - -struct MDTableBuilder -{ - MDTableBuilder(const std::vector<DurationEvent> &duration_events, - const std::vector<CounterEvent> &counter_events) - : _duration_events(duration_events), _counter_events(counter_events) - { - for (const auto &evt : _counter_events) - { - uint64_t ts = std::stoull(evt.ts); - auto &name = evt.name; - assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0); - assert(evt.values.size() == 1); - auto &val = evt.values.begin()->second; - if (_ts_to_values.find(ts) == _ts_to_values.end()) - { - std::pair<uint32_t, uint32_t> values; - if (name.compare("maxrss") == 0) - values.first = std::stoul(val); - else - values.second = std::stoul(val); - _ts_to_values.insert({ts, values}); - } - else - { - auto &values = _ts_to_values.at(ts); - if (name.compare("maxrss") == 0) - values.first = std::stoul(val); - else - values.second = std::stoul(val); - } - } - } - - MDTableBuilder &build() - { - for (auto &it : divideGraph()) - { - size_t begin_idx = it.first; - size_t end_idx = it.second; - std::map<std::string, OpSeq> name_to_opseq; - for (size_t i = begin_idx + 1; i < end_idx; ++i) - { - const auto &evt = _duration_events[i]; - assert(evt.name.compare("Graph") != 0); - assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0); - if (evt.ph.compare("B") == 0) - { - assert(name_to_opseq.find(evt.name) == name_to_opseq.end()); - name_to_opseq.insert({evt.name, makeOpSeq(evt)}); - } - else - { - assert(name_to_opseq.find(evt.name) != name_to_opseq.end()); - auto &opseq = name_to_opseq.at(evt.name); - updateOpSeq(opseq, evt); - } - } - - _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq)); - } - - return *this; - } - - std::vector<std::pair<size_t, size_t>> divideGraph() - { - std::vector<std::pair<size_t, 
size_t>> graph_idx_list; // pair<begin_idx, end_idx> - for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i) - { - const auto &evt = _duration_events.at(i); - if (evt.name.compare("Graph") == 0) - { - if (evt.ph.compare("B") == 0) - begin_idx = i; - else - graph_idx_list.emplace_back(begin_idx, i); - } - } - return graph_idx_list; - } - - OpSeq makeOpSeq(const DurationEvent &evt) - { - OpSeq opseq; - opseq.name = evt.name; - opseq.begin_ts = std::stoull(evt.ts); - opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first); - opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second); - opseq.backend = evt.tid; - return opseq; - } - - void updateOpSeq(OpSeq &opseq, const DurationEvent &evt) - { - opseq.end_ts = std::stoull(evt.ts); - opseq.updateRss(_ts_to_values.at(opseq.end_ts).first); - opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second); - } - - Graph makeGraph(size_t begin_idx, size_t end_idx, - const std::map<std::string, OpSeq> &name_to_opseq) - { - Graph graph; - graph.name = "Graph"; - graph.begin_ts = std::stoull(_duration_events[begin_idx].ts); - graph.updateRss(_ts_to_values.at(graph.begin_ts).first); - graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second); - graph.end_ts = std::stoull(_duration_events[end_idx].ts); - graph.updateRss(_ts_to_values.at(graph.end_ts).first); - graph.updateMinflt(_ts_to_values.at(graph.end_ts).second); - graph.setOpSeqs(name_to_opseq); - return graph; - } - - void write(std::ostream &os) - { - // Write contents - for (size_t i = 0; i < _graphs.size(); ++i) - { - os << "# Graph " << i << "\n"; - _graphs.at(i).write(os); - } - } - - const std::vector<DurationEvent> &_duration_events; - const std::vector<CounterEvent> &_counter_events; - // timestamp to std::pair<maxrss, minflt> - std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values; - std::vector<Graph> _graphs; -}; - -} // namespace - void EventRecorder::emit(const DurationEvent &evt) { std::lock_guard<std::mutex> lock{_mu}; @@ -412,146 +29,3 @@ void EventRecorder::emit(const CounterEvent &evt) _counter_events.push_back(evt); } - -void EventRecorder::writeToFile(std::ostream &os) -{ - std::lock_guard<std::mutex> lock{_mu}; - - switch (_write_format) - { - case WriteFormat::CHROME_TRACING: - writeChromeTrace(os); - break; - case WriteFormat::SNPE_BENCHMARK: - writeSNPEBenchmark(os); - break; - case WriteFormat::MD_TABLE: - writeMDTable(os); - break; - default: - assert(!"Invalid value"); - break; - } -} - -void EventRecorder::writeSNPEBenchmark(std::ostream &os) -{ - Json::Value root; - auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue}; - - struct Stat - { - uint64_t sum = 0; - uint64_t count = 0; - uint64_t max = 0; - uint64_t min = std::numeric_limits<uint64_t>::max(); - - void accumulate(uint64_t val) - { - sum += val; - count++; - max = std::max(max, val); - min = std::min(min, val); - } - }; - - // Memory - { - std::unordered_map<std::string, Stat> mem_stats; - for (auto &evt : _counter_events) - { - auto &mem_stat = mem_stats[evt.name]; - uint64_t val = std::stoull(evt.values["value"]); - mem_stat.accumulate(val); - } - - auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; - for (auto &kv : mem_stats) - { - auto &key = kv.first; - auto &val = kv.second; - mem[key]["Avg_Size"] = val.sum / val.count; - mem[key]["Max_Size"] = val.max; - mem[key]["Min_Size"] = val.min; - mem[key]["Runtime"] = "NA"; - } - } - - // Operation Execution Time - { - // NOTE This assumes _duration_events is sorted by "ts" ascending - - // 2D 
keys : stats[tid][name] - std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats; - std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps; - for (auto &evt : _duration_events) - { - auto &stat = stats[evt.tid][evt.name]; - auto &begin_ts = begin_timestamps[evt.tid][evt.name]; - uint64_t timestamp = std::stoull(evt.ts); - if (evt.ph == "B") - { - if (begin_ts != 0) - throw std::runtime_error{"Invalid Data"}; - begin_ts = timestamp; - } - else if (evt.ph == "E") - { - if (begin_ts == 0 || timestamp < begin_ts) - throw std::runtime_error{"Invalid Data"}; - stat.accumulate(timestamp - begin_ts); - begin_ts = 0; - } - else - throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; - } - - for (auto &kv : begin_timestamps) - for (auto &kv2 : kv.second) - if (kv2.second != 0) - throw std::runtime_error{"Invalid Data - B and E pair does not match."}; - - for (auto &kv : stats) - { - auto &tid = kv.first; - auto &map = kv.second; - auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; - for (auto &kv : map) - { - auto &name = kv.first; - auto &val = kv.second; - json_tid[name]["Avg_Time"] = val.sum / val.count; - json_tid[name]["Max_Time"] = val.max; - json_tid[name]["Min_Time"] = val.min; - json_tid[name]["Runtime"] = tid; - } - } - } - - os << root; -} - -void EventRecorder::writeChromeTrace(std::ostream &os) -{ - os << "{\n"; - os << " " << quote("traceEvents") << ": [\n"; - - for (auto &evt : _duration_events) - { - os << " " << object(evt) << ",\n"; - } - - for (auto &evt : _counter_events) - { - os << " " << object(evt) << ",\n"; - } - - os << " { }\n"; - os << " ]\n"; - os << "}\n"; -} - -void EventRecorder::writeMDTable(std::ostream &os) -{ - MDTableBuilder(_duration_events, _counter_events).build().write(os); -} diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h index 37ec1a0f1..7af4c7ddb 100644 --- a/runtime/onert/core/src/util/EventRecorder.h +++ b/runtime/onert/core/src/util/EventRecorder.h @@ -21,7 +21,6 @@ #include <memory> #include <mutex> -#include <ostream> #include <vector> struct Event @@ -50,14 +49,6 @@ struct CounterEvent : public Event class EventRecorder { public: - enum class WriteFormat - { - CHROME_TRACING, - SNPE_BENCHMARK, - MD_TABLE, - }; - -public: EventRecorder() = default; public: @@ -66,18 +57,11 @@ public: public: bool empty() { return _duration_events.empty() && _counter_events.empty(); } - void writeToFile(std::ostream &os); - void setWriteFormat(WriteFormat write_format) { _write_format = write_format; } - -private: - void writeSNPEBenchmark(std::ostream &os); - void writeChromeTrace(std::ostream &os); - void writeMDTable(std::ostream &os); + const std::vector<DurationEvent> &duration_events() const { return _duration_events; } + const std::vector<CounterEvent> &counter_events() const { return _counter_events; } private: std::mutex _mu; - // TODO: Allow user to control write_format - WriteFormat _write_format{WriteFormat::SNPE_BENCHMARK}; std::vector<DurationEvent> _duration_events; std::vector<CounterEvent> _counter_events; }; diff --git a/runtime/onert/core/src/util/EventWriter.cc b/runtime/onert/core/src/util/EventWriter.cc new file mode 100644 index 000000000..dacb40e64 --- /dev/null +++ b/runtime/onert/core/src/util/EventWriter.cc @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/EventWriter.h" + +#include <sstream> +#include <vector> +#include <unordered_map> +#include <json/json.h> +#include <assert.h> +#include <utility> +#include <map> +#include <set> +#include <stdint.h> +#include <fstream> + +// json type for Chrome Event Trace +namespace +{ + +std::string quote(const std::string &value) +{ + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); +} + +std::string field(const std::string &k, const std::string &v) +{ + std::stringstream ss; + ss << quote(k) << " : " << quote(v); + return ss.str(); +} + +struct Content // One Entry in Chrome Event Trace +{ + std::vector<std::pair<std::string, std::string>> flds; + std::vector<std::pair<std::string, std::string>> args; +}; + +std::string object(const Content &content) +{ + std::stringstream ss; + + ss << "{ "; + + ss << field(content.flds[0].first, content.flds[0].second); + + for (uint32_t n = 1; n < content.flds.size(); ++n) + { + ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second); + } + + if (content.args.size() > 0) + { + ss << ", " << quote("args") << " : { "; + ss << field(content.args.at(0).first, content.args.at(0).second); + + for (uint32_t n = 1; n < content.args.size(); ++n) + { + ss << ", " << field(content.args.at(n).first, content.args.at(n).second); + } + + ss << "}"; + } + + ss << " }"; + + return ss.str(); +} + +void fill(Content &content, const Event &evt) +{ + content.flds.emplace_back("name", evt.name); + content.flds.emplace_back("pid", "0"); + content.flds.emplace_back("tid", evt.tid); + content.flds.emplace_back("ph", evt.ph); + content.flds.emplace_back("ts", evt.ts); +} + +std::string object(const DurationEvent &evt) +{ + Content content; + + fill(content, evt); + + return ::object(content); +} + +std::string object(const CounterEvent &evt) +{ + Content content; + + fill(content, evt); + + for (auto it = evt.values.begin(); it != evt.values.end(); ++it) + { + content.args.emplace_back(it->first, it->second); + } + + return ::object(content); +} + +} // namespace + +// md table type +namespace +{ + +void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list) +{ + os << "| "; + for (auto &key : list) + { + os << key << " | "; + } + os << "\n"; +} + +struct MDContent +{ + std::string name; + uint64_t begin_ts; + uint64_t end_ts; + uint32_t min_rss; + uint32_t max_rss; + uint32_t min_page_reclaims; + uint32_t max_page_reclaims; + + MDContent() + : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX), + max_page_reclaims(0) + { + // DO NOTHING + } + + virtual ~MDContent() = default; + + void updateRss(uint32_t rss) + { + if (min_rss == UINT32_MAX) + min_rss = rss; + if (max_rss == 0) + max_rss = rss; + + if (min_rss > rss) + min_rss = rss; + else if (max_rss < rss) + max_rss = rss; + } + + void updateMinflt(uint32_t minflt) + { + if (min_page_reclaims == UINT32_MAX) + min_page_reclaims = minflt; + if 
(max_page_reclaims == 0) + max_page_reclaims = minflt; + + if (min_page_reclaims > minflt) + min_page_reclaims = minflt; + else if (max_page_reclaims < minflt) + max_page_reclaims = minflt; + } + + virtual void write(std::ostream &os) const = 0; +}; + +struct OpSeq : public MDContent +{ + std::string backend; + uint64_t graph_latency; + + struct OpSeqCmp + { + bool operator()(const OpSeq &lhs, const OpSeq &rhs) const + { + return lhs.begin_ts < rhs.begin_ts; + } + bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } + bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } + }; + + void write(std::ostream &os) const override + { + uint64_t opseq_latency = end_ts - begin_ts; + double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0; + writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per), + std::to_string(min_rss), std::to_string(max_rss), + std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)}); + } +}; + +struct Graph : public MDContent +{ + std::set<OpSeq, OpSeq::OpSeqCmp> opseqs; + + void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq) + { + uint64_t graph_latency = end_ts - begin_ts; + for (auto it : name_to_opseq) + { + auto opseq = it.second; + opseq.graph_latency = graph_latency; + + opseqs.insert(opseq); + + updateRss(opseq.min_rss); + updateRss(opseq.max_rss); + updateMinflt(opseq.min_page_reclaims); + updateMinflt(opseq.max_page_reclaims); + } + } + + void write(std::ostream &os) const override + { + static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)", + "page_reclaims_min", "page_reclaims_max"}; + + static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------", + "-----------------", "-----------------"}; + + // Graph's Header + writeMDTableRow(os, graph_headers); + writeMDTableRow(os, graph_headers_line); + + // Graph's contents + writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss), + std::to_string(max_rss), std::to_string(min_page_reclaims), + std::to_string(max_page_reclaims)}); + + os << "\n"; + + static std::vector<std::string> opseq_headers{ + "OpSeq name", "backend", "latency(us)", "latency(%)", + "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"}; + + static std::vector<std::string> opseq_headers_line{ + "----------", "-------", "-----------", "-----------", + "-------", "-------", "-----------------", "-----------------"}; + + os << "## OpSequences \n"; + + // OpSeq's Header + writeMDTableRow(os, opseq_headers); + writeMDTableRow(os, opseq_headers_line); + + // OpSeq's contents + for (auto opseq : opseqs) + { + opseq.write(os); + } + + os << "\n"; + } +}; + +struct MDTableBuilder +{ + MDTableBuilder(const std::vector<DurationEvent> &duration_events, + const std::vector<CounterEvent> &counter_events) + : _duration_events(duration_events), _counter_events(counter_events) + { +// when ready with low overhead in release build +#ifdef DEBUG + for (const auto &evt : _counter_events) + { + uint64_t ts = std::stoull(evt.ts); + auto &name = evt.name; + assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0); + assert(evt.values.size() == 1); + auto &val = evt.values.begin()->second; + if (_ts_to_values.find(ts) == _ts_to_values.end()) + { + std::pair<uint32_t, uint32_t> values; + if (name.compare("maxrss") == 0) + values.first = std::stoul(val); + else + values.second = std::stoul(val); + 
_ts_to_values.insert({ts, values}); + } + else + { + auto &values = _ts_to_values.at(ts); + if (name.compare("maxrss") == 0) + values.first = std::stoul(val); + else + values.second = std::stoul(val); + } + } +#endif + } + + MDTableBuilder &build() + { + for (auto &it : divideGraph()) + { + size_t begin_idx = it.first; + size_t end_idx = it.second; + std::map<std::string, OpSeq> name_to_opseq; + for (size_t i = begin_idx + 1; i < end_idx; ++i) + { + const auto &evt = _duration_events[i]; + assert(evt.name.compare("Graph") != 0); + assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0); + if (evt.ph.compare("B") == 0) + { + assert(name_to_opseq.find(evt.name) == name_to_opseq.end()); + name_to_opseq.insert({evt.name, makeOpSeq(evt)}); + } + else + { + assert(name_to_opseq.find(evt.name) != name_to_opseq.end()); + auto &opseq = name_to_opseq.at(evt.name); + updateOpSeq(opseq, evt); + } + } + + _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq)); + } + + return *this; + } + + std::vector<std::pair<size_t, size_t>> divideGraph() + { + std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx> + for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i) + { + const auto &evt = _duration_events.at(i); + if (evt.name.compare("Graph") == 0) + { + if (evt.ph.compare("B") == 0) + begin_idx = i; + else + graph_idx_list.emplace_back(begin_idx, i); + } + } + return graph_idx_list; + } + + OpSeq makeOpSeq(const DurationEvent &evt) + { + OpSeq opseq; + opseq.name = evt.name; + opseq.begin_ts = std::stoull(evt.ts); + opseq.backend = evt.tid; +#ifdef DEBUG + opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first); + opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second); +#else + opseq.updateRss(0); + opseq.updateMinflt(0); +#endif + return opseq; + } + + void updateOpSeq(OpSeq &opseq, const DurationEvent &evt) + { + opseq.end_ts = std::stoull(evt.ts); +#ifdef DEBUG + opseq.updateRss(_ts_to_values.at(opseq.end_ts).first); + opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second); +#else + opseq.updateRss(0); + opseq.updateMinflt(0); +#endif + } + + Graph makeGraph(size_t begin_idx, size_t end_idx, + const std::map<std::string, OpSeq> &name_to_opseq) + { + Graph graph; + graph.name = "Graph"; + graph.begin_ts = std::stoull(_duration_events[begin_idx].ts); + graph.end_ts = std::stoull(_duration_events[end_idx].ts); + graph.setOpSeqs(name_to_opseq); +#ifdef DEBUG + graph.updateRss(_ts_to_values.at(graph.begin_ts).first); + graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second); + graph.updateRss(_ts_to_values.at(graph.end_ts).first); + graph.updateMinflt(_ts_to_values.at(graph.end_ts).second); +#else + graph.updateRss(0); + graph.updateMinflt(0); +#endif + return graph; + } + + void write(std::ostream &os) + { + // Write contents + for (size_t i = 0; i < _graphs.size(); ++i) + { + os << "# Graph " << i << "\n"; + _graphs.at(i).write(os); + } + } + + const std::vector<DurationEvent> &_duration_events; + const std::vector<CounterEvent> &_counter_events; + // timestamp to std::pair<maxrss, minflt> + std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values; + std::vector<Graph> _graphs; +}; + +} // namespace + +EventWriter::EventWriter(const EventRecorder &recorder) : _recorder(recorder) +{ + // DO NOTHING +} + +void EventWriter::writeToFiles(const std::string &base_filepath) +{ + // Note. 
According to an internal issue, let snpe json as just file name not '.snpe.json' + writeToFile(base_filepath, WriteFormat::SNPE_BENCHMARK); + writeToFile(base_filepath + ".chrome.json", WriteFormat::CHROME_TRACING); + writeToFile(base_filepath + ".table.md", WriteFormat::MD_TABLE); +} + +void EventWriter::writeToFile(const std::string &filepath, WriteFormat write_format) +{ + std::ofstream os{filepath, std::ofstream::out}; + switch (write_format) + { + case WriteFormat::CHROME_TRACING: + writeChromeTrace(os); + break; + case WriteFormat::SNPE_BENCHMARK: + writeSNPEBenchmark(os); + break; + case WriteFormat::MD_TABLE: + writeMDTable(os); + break; + default: + assert(!"Invalid value"); + break; + } +} + +void EventWriter::writeSNPEBenchmark(std::ostream &os) +{ + Json::Value root; + auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue}; + + struct Stat + { + uint64_t sum = 0; + uint64_t count = 0; + uint64_t max = 0; + uint64_t min = std::numeric_limits<uint64_t>::max(); + + void accumulate(uint64_t val) + { + sum += val; + count++; + max = std::max(max, val); + min = std::min(min, val); + } + }; + + // Memory + { + std::unordered_map<std::string, Stat> mem_stats; + for (auto &evt : _recorder.counter_events()) + { + auto &mem_stat = mem_stats[evt.name]; + uint64_t val = std::stoull(evt.values.at("value")); + mem_stat.accumulate(val); + } + + auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; + for (auto &kv : mem_stats) + { + auto &key = kv.first; + auto &val = kv.second; + mem[key]["Avg_Size"] = val.sum / val.count; + mem[key]["Max_Size"] = val.max; + mem[key]["Min_Size"] = val.min; + mem[key]["Runtime"] = "NA"; + } + } + + // Operation Execution Time + { + // NOTE This assumes _duration_events is sorted by "ts" ascending + + // 2D keys : stats[tid][name] + std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats; + std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps; + for (auto &evt : _recorder.duration_events()) + { + auto &stat = stats[evt.tid][evt.name]; + auto &begin_ts = begin_timestamps[evt.tid][evt.name]; + uint64_t timestamp = std::stoull(evt.ts); + if (evt.ph == "B") + { + if (begin_ts != 0) + throw std::runtime_error{"Invalid Data"}; + begin_ts = timestamp; + } + else if (evt.ph == "E") + { + if (begin_ts == 0 || timestamp < begin_ts) + throw std::runtime_error{"Invalid Data"}; + stat.accumulate(timestamp - begin_ts); + begin_ts = 0; + } + else + throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; + } + + for (auto &kv : begin_timestamps) + for (auto &kv2 : kv.second) + if (kv2.second != 0) + throw std::runtime_error{"Invalid Data - B and E pair does not match."}; + + for (auto &kv : stats) + { + auto &tid = kv.first; + auto &map = kv.second; + auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; + for (auto &kv : map) + { + auto &name = kv.first; + auto &val = kv.second; + json_tid[name]["Avg_Time"] = val.sum / val.count; + json_tid[name]["Max_Time"] = val.max; + json_tid[name]["Min_Time"] = val.min; + json_tid[name]["Runtime"] = tid; + } + } + } + + os << root; +} + +void EventWriter::writeChromeTrace(std::ostream &os) +{ + os << "{\n"; + os << " " << quote("traceEvents") << ": [\n"; + + for (auto &evt : _recorder.duration_events()) + { + os << " " << object(evt) << ",\n"; + } + + for (auto &evt : _recorder.counter_events()) + { + os << " " << object(evt) << ",\n"; + } + + os << " { }\n"; + os << " ]\n"; + os << "}\n"; +} + +void 
EventWriter::writeMDTable(std::ostream &os) +{ + MDTableBuilder(_recorder.duration_events(), _recorder.counter_events()).build().write(os); +} diff --git a/runtime/onert/core/src/util/EventWriter.h b/runtime/onert/core/src/util/EventWriter.h new file mode 100644 index 000000000..7e838ca82 --- /dev/null +++ b/runtime/onert/core/src/util/EventWriter.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_UTIL_EVENT_WRITER_H__ +#define __ONERT_UTIL_EVENT_WRITER_H__ + +#include "EventRecorder.h" + +#include <string> +#include <ostream> + +class EventWriter +{ +public: + enum class WriteFormat + { + CHROME_TRACING, + SNPE_BENCHMARK, + MD_TABLE, + }; + +public: + EventWriter(const EventRecorder &recorder); + +public: + void writeToFiles(const std::string &base_filepath); + void writeToFile(const std::string &filepath, WriteFormat write_format); + +private: + void writeSNPEBenchmark(std::ostream &os); + void writeChromeTrace(std::ostream &os); + void writeMDTable(std::ostream &os); + +private: + const EventRecorder &_recorder; +}; + +#endif // __ONERT_UTIL_EVENT_WRITER_H__ diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc index 95c15049d..0278df4d2 100644 --- a/runtime/onert/core/src/util/ShapeInference.cc +++ b/runtime/onert/core/src/util/ShapeInference.cc @@ -22,6 +22,7 @@ #include "util/logging.h" #include <cassert> +#include <numeric> #include <sstream> #include <cmath> @@ -72,6 +73,19 @@ ir::Shape broadcastShapes(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape } // namespace +namespace bcq +{ +inline int getOutputSize(const ir::Shape &cluster_shape, const int32_t *cluster_buf) +{ + int size = 0; + for (int idx = 0; idx < cluster_shape.dim(0); idx++) + { + size += cluster_buf[idx * 2 + 1]; + } + return size; +} +} // namespace bcq + // // Shape inference // @@ -116,6 +130,11 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank) { + if (axis < 0 || axis >= rank) + { + throw std::runtime_error("ArgMax shape inference: Wrong axis value " + std::to_string(axis)); + } + ir::Shape out_shape; for (int idx = 0; idx < rank; ++idx) { @@ -259,19 +278,24 @@ ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs return output_shape; } -ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer) +/* + * shp_shape : SHAPE input tensor's shape + * shp_buf : SHAPE input tensor's buffer + */ +ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf) { - const int num_elements = wshape.num_elements(); + + const int num_elements = shp_shape.num_elements(); assert(num_elements != 0); - assert(shape_buffer); + assert(shp_buf); ir::Shape new_shape(num_elements); for (int i = 0; i < num_elements; ++i) { - assert(shape_buffer[i] != 0); // It shouldn't be 0. 
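// For the BCQ shape inference added to this file: the clusters tensor is a rank-2
// [num_clusters, 2] array whose second column holds each cluster's size (only that column is
// read here), and bcq::getOutputSize() above sums it. A small worked example with made-up
// cluster values (illustrative only, not part of the patch):
//
//   const int32_t cluster_buf[] = {0, 4, 1, 8};   // two clusters, sizes 4 and 8
//   // bcq::getOutputSize(cluster_shape, cluster_buf) == 4 + 8 == 12,
//   // so inferBCQFullyConnectedShape() below yields {12, in_shape.dim(1)} for the weights.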
- new_shape.dim(i) = shape_buffer[i]; + assert(shp_buf[i] != 0); // It shouldn't be 0. + new_shape.dim(i) = shp_buf[i]; } return new_shape; @@ -305,6 +329,9 @@ ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat: ir::Shape inferConv2DShape(const ir::Shape &in_shape, const ir::Shape &ker_shape, const ir::operation::Conv2D::Param &param, ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"Conv2D: stride values must be positive"}; + auto ifm_shape = in_shape.asFeature(layout); // Kernel format is [depth_out, kernel_height, kernel_width, depth_in] @@ -321,6 +348,9 @@ ir::Shape inferDepthwiseConv2DShape(const ir::Shape &in_shape, const ir::Shape & const ir::operation::DepthwiseConv2D::Param &param, ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"DepthwiseConv2D: stride values must be positive"}; + assert(layout == ir::Layout::NHWC); auto ifm_shape = in_shape.asFeature(layout); @@ -354,13 +384,13 @@ ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis) return out_shape; } -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buffer) +ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf) { ir::Shape out_shape(in_shape.dim(0)); for (int out_x = 0; out_x < out_shape.rank(); ++out_x) { - out_shape.dim(out_x) = buffer[out_x]; + out_shape.dim(out_x) = in_buf[out_x]; } return out_shape; @@ -380,11 +410,60 @@ ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &k return {ir::Shape({static_cast<int32_t>(batch_size), num_units})}; } +ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf) +{ + assert(cluster_shape.rank() == 2); + assert(cluster_shape.dim(1) == 2); + + const auto input_size = in_shape.dim(1); + const auto output_size = bcq::getOutputSize(cluster_shape, cluster_buf); + + return {ir::Shape({output_size, input_size})}; +} + +ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf, int rank, + const ir::operation::BCQGather::Param &param) +{ + ir::Shape out_shape; + ir::Shape in_original_shape; + + assert(cluster_shape.rank() == 2); + assert(cluster_shape.dim(1) == 2); + + auto hidden_size = param.input_hidden_size; + auto axis = param.axis; + + in_original_shape.append(bcq::getOutputSize(cluster_shape, cluster_buf)); + in_original_shape.append(hidden_size); + + const int indices_rank = indices_shape.rank(); + for (int idx = 0; idx < rank; ++idx) + { + if (idx == (int)axis) + { + for (int indices_idx = 0; indices_idx < indices_rank; indices_idx++) + { + out_shape.append(indices_shape.dim(indices_idx)); + } + } + else + { + out_shape.append(in_original_shape.dim(idx)); + } + } + + return out_shape; +} + ir::Shape inferGatherShape(const ir::Shape &input_shape, const ir::Shape &indices_shape, int axis, int rank) { ir::Shape out_shape; + const int indices_rank = indices_shape.rank(); + for (int idx = 0; idx < rank; ++idx) { if (idx == axis) @@ -470,6 +549,9 @@ ir::Shape inferPadShape(const ir::Shape &in_shape, const int32_t *pad_buf, const ir::Shape inferPoolShape(const ir::Shape &in_shape, const ir::operation::Pool2D::Param &param, const ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"Pool2D: stride values must be positive"}; + assert(layout == ir::Layout::NHWC); auto ifm_shape
= in_shape.asFeature(layout); const auto out_h_w = calcConvLikeHeightAndWidth(ifm_shape.H, ifm_shape.W, param.kh, param.kw, @@ -482,6 +564,17 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp const int32_t output_width) { assert(in_shape.rank() == 4); + if (output_height < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " + + std::to_string(output_height)}; + } + if (output_width < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " + + std::to_string(output_width)}; + } + ir::Shape ret(in_shape.rank()); ret.dim(0) = in_shape.dim(0); @@ -613,7 +706,8 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i return new_shape; } -ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, const int32_t *sizes) +ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf, + const int32_t *sizes_buf) { const uint32_t rank = input_shape.rank(); ir::Shape out_shape(rank); @@ -623,12 +717,12 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c const auto input_dim = input_shape.dim(idx); // begin is zero-based - auto begin = begins[idx]; + auto begin = begins_buf[idx]; if (begin < 0) throw std::runtime_error("shape inference Slice: Invalid begin."); // size is one-based - auto size = sizes[idx]; + auto size = sizes_buf[idx]; if (size < -1) throw std::runtime_error("shape inference Slice: Invalid size."); @@ -648,8 +742,8 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c } ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape, - const ir::Shape &padding_shape, const int32_t *block_shape_data, - const int32_t *padding_data) + const ir::Shape &padding_shape, const int32_t *block_shape_buf, + const int32_t *padding_buf) { const uint32_t rank = input_shape.rank(); ir::Shape out_shape(rank); @@ -677,14 +771,14 @@ ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape for (int dim = 0; dim < kSpatialDimensionNum; ++dim) { int final_dim_size = - (input_shape.dim(dim + 1) + padding_data[dim * 2] + padding_data[dim * 2 + 1]); + (input_shape.dim(dim + 1) + padding_buf[dim * 2] + padding_buf[dim * 2 + 1]); - assert(final_dim_size % block_shape_data[dim] == 0); + assert(final_dim_size % block_shape_buf[dim] == 0); - out_shape.dim(dim + 1) = final_dim_size / block_shape_data[dim]; + out_shape.dim(dim + 1) = final_dim_size / block_shape_buf[dim]; } - const int output_batch_size = input_shape.dim(0) * block_shape_data[0] * block_shape_data[1]; + const int output_batch_size = input_shape.dim(0) * block_shape_buf[0] * block_shape_buf[1]; const int output_channel_size = input_shape.dim(3); out_shape.dim(0) = output_batch_size; @@ -948,35 +1042,71 @@ ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSlic return out_shape; } -ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier) +ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf, + const int32_t multiplier_size) { - // assert(in_shape.rank() == multiplier.rank()); + if (multiplier_size != in_shape.rank()) + { + throw std::runtime_error("inferTileShape failed, input rank: " + + std::to_string(in_shape.rank()) + ", bad multipliers size: " + + std::to_string(multiplier_size) + ""); + } ir::Shape new_Shape(in_shape.rank()); for (int i = 0; i < 
in_shape.rank(); ++i) { - assert(multiplier[i]); // multiplier[i] shuld not be 0. - new_Shape.dim(i) = in_shape.dim(i) * multiplier[i]; + assert(multiplier_buf[i]); // multiplier_buf[i] shuld not be 0. + new_Shape.dim(i) = in_shape.dim(i) * multiplier_buf[i]; } return new_Shape; } -ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm) +ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf, + const int32_t perm_size) { - if (static_cast<int>(perm.size()) > in_shape.rank()) + const auto rank = in_shape.rank(); + if (perm_size > rank) + { + throw std::runtime_error("inferTransposeShape failed, bad permutation size: " + + std::to_string(perm_size)); + } + + const int32_t *perm_data = perm_buf; + std::vector<int32_t> regular_perm_vec; + if (perm_size == 0) + { + // perm_data will be set to (n-1...0) + regular_perm_vec.resize(rank); + std::iota(regular_perm_vec.begin(), regular_perm_vec.end(), 0); + std::reverse(regular_perm_vec.begin(), regular_perm_vec.end()); + perm_data = regular_perm_vec.data(); + } + else { - throw std::runtime_error("inferTransposeShape failed, bad rank size: " + - std::to_string(static_cast<int>(perm.size()))); + assert(rank == perm_size); } - ir::Shape out_shape(static_cast<int>(perm.size())); - for (int idx = 0; idx < static_cast<int>(perm.size()); idx++) + + ir::Shape out_shape(rank); + std::vector<bool> visit_perms(rank, false); + for (int idx = 0; idx < rank; idx++) { - if (perm[idx] < 0 || perm[idx] >= static_cast<int>(perm.size())) + const auto perm_val = perm_data[idx]; + // Check invalid permutation value + if (perm_val < 0 || perm_val >= rank) { - throw std::runtime_error("inferTransposeShape failed, bad perm value: " + - std::to_string(perm[idx])); + throw std::runtime_error("inferTransposeShape failed, bad permutation value: " + + std::to_string(perm_val)); } - out_shape.dim(idx) = in_shape.dim(perm[idx]); + + // Check duplicated permutation value + if (visit_perms.at(perm_val)) + { + throw std::runtime_error("inferTransposeShape failed, duplicated permutation value: " + + std::to_string(perm_val)); + } + visit_perms.at(perm_val) = true; + + out_shape.dim(idx) = in_shape.dim(perm_val); } return out_shape; } diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index 480452e01..d21001e59 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -1,4 +1,5 @@ /* + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,7 +39,7 @@ namespace onert namespace base_loader { -template <typename LoaderDomain, typename SpecificLoader> class BaseLoader +template <typename LoaderDomain> class BaseLoader { protected: using Verifier = typename LoaderDomain::Verifier; @@ -69,6 +70,7 @@ public: explicit BaseLoader(std::unique_ptr<ir::Subgraphs> &subgs) : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr} { + _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA); } /** @@ -93,7 +95,6 @@ protected: ir::Activation convertActivation(ActivationFunctionType type); ir::DataType tensorTypeToDataType(TensorType type); ir::OperandIndex tensorIdxToOperandIdx(int32_t tensorIdx); - void deallocateMmappedArea(uint8_t *ptr, size_t size); // Create operands form tflite::Tensor ir::OperandIndex loadOperand(const Tensor *tensor, ir::Graph &subg); @@ -107,7 +108,11 @@ protected: // Load Pool2D param template <typename Param> void loadPool2DOptions(Param &param, const Pool2DOptions *options); +private: + virtual std::unique_ptr<ir::Graph> loadSubgraph(const SubGraph *subg) = 0; // Operations + template <typename OpIR, typename... Args> + const OpIR *loadOperationTo(const Operator *op, ir::Graph &subg, Args &&... args); void loadConv2D(const Operator *op, ir::Graph &subg); void loadDepthwiseConv2D(const Operator *op, ir::Graph &subg); void loadTransposeConv(const Operator *op, ir::Graph &subg); @@ -115,62 +120,50 @@ protected: void loadReshape(const Operator *op, ir::Graph &subg); void loadSoftmax(const Operator *op, ir::Graph &subg); void loadConcatenation(const Operator *op, ir::Graph &subg); - void loadFill(const Operator *op, ir::Graph &subg); void loadFC(const Operator *op, ir::Graph &subg); - template <ir::operation::BinaryArithmetic::ArithmeticType op_type> - void loadBinaryArithmetic(const Operator *op, ir::Graph &subg); + void loadBinaryArithmetic(const Operator *op, ir::Graph &subg, + ir::operation::BinaryArithmetic::ArithmeticType op_type); void loadAddV2(const Operator *op, ir::Graph &subg); void loadPack(const Operator *op, ir::Graph &subg); void loadResizeBilinear(const Operator *op, ir::Graph &subg); void loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg); - void loadSelect(const Operator *op, ir::Graph &subg); - void loadSquaredDifference(const Operator *op, ir::Graph &subg); - void loadTranspose(const Operator *op, ir::Graph &subg); - template <ir::operation::Reduce::ReduceType reduce_type> - void loadReduce(const Operator *op, ir::Graph &subg); + void loadReduce(const Operator *op, ir::Graph &subg, + ir::operation::Reduce::ReduceType reduce_type); void loadReduceAll(const Operator *op, ir::Graph &subg); - void loadReverseV2(const Operator *op, ir::Graph &subg); - void loadPad(const Operator *op, ir::Graph &subg); void loadElementwiseActivation(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, float alpha = 0.f, float beta = 0.f); - template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type> - void loadElementwiseBinary(const Operator *op, ir::Graph &subg); + void loadElementwiseBinary(const Operator *op, ir::Graph &subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type); void loadElementwiseUnary(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type); - void loadExpandDims(const Operator *op, ir::Graph &subg); void loadGather(const Operator *op, ir::Graph &subg); void
loadCustom(const Operator *op, ir::Graph &subg); - void loadSpaceToBatchND(const Operator *op, ir::Graph &subg); void loadBatchMatMul(const Operator *op, ir::Graph &subg); - void loadBatchToSpaceND(const Operator *op, ir::Graph &subg); void loadSqueeze(const Operator *op, ir::Graph &subg); - void loadPrelu(const Operator *op, ir::Graph &subg); void loadSplit(const Operator *op, ir::Graph &subg); void loadSplitV(const Operator *op, ir::Graph &subg); - void loadSlice(const Operator *op, ir::Graph &subg); void loadStridedSlice(const Operator *op, ir::Graph &subg); void loadUnpack(const Operator *op, ir::Graph &subg); void loadComparison(const Operator *op, ir::Graph &subg); void loadEinsum(const Operator *op, ir::Graph &subg); void loadOneHot(const Operator *op, ir::Graph &subg); - void loadShape(const Operator *op, ir::Graph &subg); void loadIf(const Operator *op, ir::Graph &subg); void loadWhile(const Operator *op, ir::Graph &subg); void loadArgMax(const Operator *op, ir::Graph &subg); - void loadPow(const Operator *op, ir::Graph &subg); - void loadTile(const Operator *op, ir::Graph &subg); - void loadRange(const Operator *op, ir::Graph &subg); - void loadRank(const Operator *op, ir::Graph &subg); - void loadMatrixBandPart(const Operator *op, ir::Graph &subg); - void loadBroadcastTo(const Operator *op, ir::Graph &subg); void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); void loadLogSoftmax(const Operator *op, ir::Graph &subg); void loadSpaceToDepth(const Operator *op, ir::Graph &subg); - void loadStatelessRandomUniform(const Operator *op, ir::Graph &subg); - void loadL2Normalization(const Operator *op, ir::Graph &subg); void loadLeakyRelu(const Operator *op, ir::Graph &subg); + void verifySubgraphIndex(int subg_index) + { + const auto num_subgraphs = _model->subgraphs()->size(); + if (subg_index < 0 || subg_index >= static_cast<int32_t>(num_subgraphs)) + throw std::runtime_error{std::string{"Invalid subgraph index - "} + + std::to_string(subg_index)}; + } + protected: // Base address for mapped region for loading (if needed) uint8_t *_base; @@ -186,10 +179,12 @@ protected: std::unordered_map<ir::OperandIndex, std::string> _tensor_names; // Verifier std::unique_ptr<Verifier> _verifier; + // Boolean flag to use MMAPED_DATA + bool _use_mmaped_data = false; }; -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const char *file_path) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::BaseLoader::loadFromFile(const char *file_path) { _fd = open(file_path, O_RDONLY); if (_fd < 0) @@ -216,22 +211,22 @@ void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const ch _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size); loadModel(); + munmap(_base, size); close(_fd); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromBuffer(uint8_t *buffer, - size_t size) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::BaseLoader::loadFromBuffer(uint8_t *buffer, size_t size) { _base = buffer; _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size); loadModel(); } -template <typename LoaderDomain, typename SpecificLoader> -ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActivation( - const ActivationFunctionType type) +template <typename LoaderDomain> +ir::Activation 
+BaseLoader<LoaderDomain>::BaseLoader::convertActivation(const ActivationFunctionType type) { switch (type) { @@ -246,14 +241,13 @@ ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActi case ActivationFunctionType::ActivationFunctionType_TANH: return ir::Activation::TANH; default: - throw std::runtime_error(std::string("Unsupported activation type: ") - .append(EnumNameActivationFunctionType(type))); + throw std::runtime_error(std::string("Unsupported or invalid activation type: ") + + std::to_string(static_cast<int>(type))); } } -template <typename LoaderDomain, typename SpecificLoader> -ir::DataType -BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const TensorType type) +template <typename LoaderDomain> +ir::DataType BaseLoader<LoaderDomain>::BaseLoader::tensorTypeToDataType(const TensorType type) { switch (type) { @@ -275,39 +269,13 @@ BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const } } -template <typename LoaderDomain, typename SpecificLoader> -ir::OperandIndex -BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx) +template <typename LoaderDomain> +ir::OperandIndex BaseLoader<LoaderDomain>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx) { return isOptionalInputTensor(tensorIdx) ? ir::OperandIndex() : _tensor_to_operand[tensorIdx]; } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::deallocateMmappedArea(uint8_t *ptr, - size_t size) -{ - // Calculate offset from base address of mapped region - ptrdiff_t unaligned_offset_start = ptr - _base; - ptrdiff_t unaligned_offset_end = unaligned_offset_start + size; - - // Calculated aligned offset from base address of mapped region - // munmap accepts memory address which is a multiple of the pagesize - ptrdiff_t aligned_offset_start = - ((unaligned_offset_start + (_pagesize - 1)) / _pagesize) * _pagesize; - ptrdiff_t aligned_offset_end = (unaligned_offset_end / _pagesize) * _pagesize; - - ptrdiff_t area_size = aligned_offset_end - aligned_offset_start; - if (area_size > 0) - { - // Unmap mapped region for CachedData - if (munmap(_base + aligned_offset_start, area_size) == -1) - { - VERBOSE(BASE_LOADER) << "munmap failed" << std::endl; - } - } -} - -/* Copied from tensorflow lite. 
Need to append copyright */ +/* Copy is copied from tensorflow lite */ template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr) { if (data_ptr->values() == nullptr) @@ -324,9 +292,8 @@ template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr) return true; } -template <typename LoaderDomain, typename SpecificLoader> -ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Tensor *tensor, - ir::Graph &subg) +template <typename LoaderDomain> +ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir::Graph &subg) { ir::Shape shape; // Shape @@ -386,18 +353,44 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten { std::vector<uint16_t> w1_segments; std::vector<uint16_t> w1_indices; - // ignore traversal_order, block_map + // check traversal_order + if (src_sparsity->traversal_order()) + { + const int traversal_order_size = src_sparsity->traversal_order()->size(); + for (int i = 0; i < traversal_order_size; ++i) + { + if (i != src_sparsity->traversal_order()->Get(i)) + throw std::runtime_error("traversal_order [0, 1, ..., n-1] is only supported."); + } + } + // check block_map + int block_rank = 0; + if (src_sparsity->block_map()) + { + block_rank = src_sparsity->block_map()->size(); + for (int i = 0; i < block_rank; ++i) + { + if (i != src_sparsity->block_map()->Get(i)) + throw std::runtime_error("block_map [0, 1, ..., n-1] is only supported."); + } + } // load metadata - const size_t dim_metadata_size = src_sparsity->dim_metadata()->size(); - if (dim_metadata_size != 2) - throw std::runtime_error("sparse tensor is supported only for 2D"); + const int dim_metadata_size = src_sparsity->dim_metadata()->size(); + auto dense_rank = shape.rank(); + if (dense_rank + block_rank != dim_metadata_size) + throw std::runtime_error("sparsity dim_metadata length is wrong."); + bool random_sparsity = dim_metadata_size == 2 && block_rank == 0; + bool block2D_sparsity = dim_metadata_size == 4 && block_rank == 2; + if (dim_metadata_size != !random_sparsity && !block2D_sparsity) + throw std::runtime_error( + "sparsity is supported only for 2D tensor with random or 16x1 block sparsity."); + const auto *src_metadata = src_sparsity->dim_metadata()->Get(0); if (src_metadata->format() != DimensionType::DimensionType_DENSE) throw std::runtime_error("sparse tensor dim[0] is not DENSE"); src_metadata = src_sparsity->dim_metadata()->Get(1); if (src_metadata->format() != DimensionType::DimensionType_SPARSE_CSR) throw std::runtime_error("sparse tensor dim[0] is not SPARSE_CSR"); - auto ParseSparseIndexVector = [src_metadata, &w1_segments, &w1_indices]() { if (src_metadata->array_segments() == nullptr || src_metadata->array_indices() == nullptr) return false; @@ -433,7 +426,17 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten }; if (ParseSparseIndexVector() == false) throw std::runtime_error("Error during parsing sparsity index information"); - type_info.sparse2DMetadata(std::move(w1_segments), std::move(w1_indices)); + // Get block size + std::vector<int32_t> block_size; + for (int i = 0; i < block_rank; ++i) + { + auto block_metadata = src_sparsity->dim_metadata()->Get(dense_rank + i); + if (block_metadata->format() != DimensionType::DimensionType_DENSE) + throw std::runtime_error("block dimension must be DENSE."); + block_size.push_back(block_metadata->dense_size()); + } + type_info.sparsity(std::make_shared<ir::Sparsity>(std::move(w1_segments), std::move(w1_indices), + 
std::move(block_size))); } // Create operand const auto operand_index = subg.addOperand(shape, type_info); @@ -450,8 +453,28 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten } else // Model is loaded(mmap'd) from a file { - data_obj = std::make_unique<ir::CachedData>(data->data(), data->size()); - deallocateMmappedArea(const_cast<uint8_t *>(data->data()), data->size()); + size_t data_size = data->size(); + ptrdiff_t unaligned_offset_start = data->data() - _base; + ptrdiff_t offset_end = unaligned_offset_start + data_size; + + // Calculated aligned offset from base address of mapped region + // munmap accepts memory address which is a multiple of the pagesize + ptrdiff_t aligned_offset_start = (unaligned_offset_start / _pagesize) * _pagesize; + size_t mmap_size = offset_end - aligned_offset_start; + + if (_use_mmaped_data) + { + data_obj = std::make_unique<ir::MMapedData>(_fd, aligned_offset_start, mmap_size, + unaligned_offset_start, data_size); + } + else + { + size_t offset = unaligned_offset_start - aligned_offset_start; + uint8_t *mmap_base = static_cast<uint8_t *>( + mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, _fd, aligned_offset_start)); + data_obj = std::make_unique<ir::CachedData>(mmap_base + offset, data_size); + munmap(mmap_base, mmap_size); + } } subg.setOperandValue(operand_index, std::move(data_obj)); } @@ -465,10 +488,9 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten return operand_index; } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *op, - ir::OperandIndexSequence &inputs, - ir::OperandIndexSequence &outputs) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOperationIO(const Operator *op, ir::OperandIndexSequence &inputs, + ir::OperandIndexSequence &outputs) { for (const std::int32_t idx : *op->inputs()) { @@ -490,120 +512,116 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *o } } -template <typename LoaderDomain, typename SpecificLoader> +template <typename LoaderDomain> template <typename Param, typename OptionsType> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStridesAndPaddings(Param ¶m, - const OptionsType *options) +void BaseLoader<LoaderDomain>::loadStridesAndPaddings(Param ¶m, const OptionsType *options) { // Strides param.stride.vertical = options->stride_h(); param.stride.horizontal = options->stride_w(); // Paddings - if (options->padding() == Padding::Padding_SAME) - param.padding.type = ir::PaddingType::SAME; - if (options->padding() == Padding::Padding_VALID) - param.padding.type = ir::PaddingType::VALID; + switch (options->padding()) + { + case Padding::Padding_SAME: + param.padding.type = ir::PaddingType::SAME; + break; + case Padding::Padding_VALID: + param.padding.type = ir::PaddingType::VALID; + break; + default: + throw std::runtime_error{"Invalid padding type"}; + } // param paddings indexes unused } -template <typename LoaderDomain, typename SpecificLoader> +template <typename LoaderDomain> template <typename Param> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2DOptions(Param ¶m, - const Pool2DOptions *options) +void BaseLoader<LoaderDomain>::loadPool2DOptions(Param ¶m, const Pool2DOptions *options) { // Strides and Paddings + if (options->stride_h() <= 0 || options->stride_w() <= 0) + throw std::runtime_error{"Invalid stride vertical or horizontal - both must be bigger than 0"}; loadStridesAndPaddings(param, options); 
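Note on the constant-tensor hunk above: instead of wrapping the flatbuffer bytes in CachedData and then punching the pages out of the file mapping with the old deallocateMmappedArea, the loader now either keeps an MMapedData view of the file (when the USE_MMAPED_DATA config flag read in the constructor is set) or creates a short-lived private mapping, copies it into CachedData, and unmaps it again. Both paths rely on the same page-alignment arithmetic, because mmap offsets must be multiples of the page size. A minimal sketch of that arithmetic; MappedRange and alignToPage are illustrative names, not part of the runtime:

#include <cstddef>
#include <sys/types.h>
#include <unistd.h>

// Given where a tensor's bytes start inside the mapped model file and how long
// they are, compute a page-aligned window that covers them.
struct MappedRange
{
  off_t aligned_start; // multiple of the page size, valid as an mmap() offset
  size_t map_size;     // bytes to map starting at aligned_start
  size_t data_offset;  // where the tensor data begins inside that mapping
};

inline MappedRange alignToPage(off_t unaligned_start, size_t data_size)
{
  const off_t pagesize = getpagesize();
  const off_t aligned_start = (unaligned_start / pagesize) * pagesize; // round down
  const off_t end = unaligned_start + static_cast<off_t>(data_size);
  return MappedRange{aligned_start, static_cast<size_t>(end - aligned_start),
                     static_cast<size_t>(unaligned_start - aligned_start)};
}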
// Filter width and height // Strides + if (options->filter_width() <= 0 || options->filter_height() <= 0) + throw std::runtime_error{"Invalid filter width or height - both must be bigger than 0"}; param.kw = options->filter_width(); param.kh = options->filter_height(); // Activation param.activation = convertActivation(options->fused_activation_function()); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadConv2D(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +template <typename OpIR, typename... Args> +const OpIR *BaseLoader<LoaderDomain>::loadOperationTo(const Operator *op, ir::Graph &subg, + Args &&... args) { + static_assert(sizeof...(args) <= 1, "You can't have more than 1 arguments!"); ir::OperandIndexSequence inputs; ir::OperandIndexSequence outputs; loadOperationIO(op, inputs, outputs); + std::unique_ptr<OpIR> new_op(new OpIR(inputs, outputs, std::forward<Args>(args)...)); + auto ret = new_op.get(); + subg.addOperation(std::move(new_op)); + + return ret; +} + +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadConv2D(const Operator *op, ir::Graph &subg) +{ ir::operation::Conv2D::Param param; const auto *options = op->builtin_options_as_Conv2DOptions(); param.activation = convertActivation(options->fused_activation_function()); loadStridesAndPaddings(param, options); - param.dilation.width_factor = options->dilation_w_factor(); param.dilation.height_factor = options->dilation_h_factor(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Conv2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Conv2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadDepthwiseConv2D(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadDepthwiseConv2D(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::DepthwiseConv2D::Param param; const auto *options = op->builtin_options_as_DepthwiseConv2DOptions(); param.activation = convertActivation(options->fused_activation_function()); loadStridesAndPaddings(param, options); - // Multiplier param.multiplier = options->depth_multiplier(); // Dilation h/w factor unused - std::unique_ptr<ir::Operation> new_op(new ir::operation::DepthwiseConv2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + + loadOperationTo<ir::operation::DepthwiseConv2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTransposeConv(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadTransposeConv(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::TransposeConv::Param param; const auto *options = op->builtin_options_as_TransposeConvOptions(); loadStridesAndPaddings(param, options); - std::unique_ptr<ir::Operation> new_op(new ir::operation::TransposeConv(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + + loadOperationTo<ir::operation::TransposeConv>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2D(const 
Operator *op, ir::Graph &subg, - ir::operation::Pool2D::PoolType op_type) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadPool2D(const Operator *op, ir::Graph &subg, + ir::operation::Pool2D::PoolType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Pool2D::Param param; param.op_type = op_type; const auto *options = op->builtin_options_as_Pool2DOptions(); loadPool2DOptions(param, options); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pool2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Pool2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReshape(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reshape::Param param{}; const auto *options = op->builtin_options_as_ReshapeOptions(); if (options != nullptr) @@ -611,99 +629,64 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, i const auto *new_shape = options->new_shape(); if (new_shape) { - for (uint i = 0; i < new_shape->Length(); ++i) + for (uint i = 0; i < new_shape->size(); ++i) { param.new_shape.push_back(new_shape->Get(i)); } } } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reshape(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reshape>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSoftmax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSoftmax(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Softmax::Param param; const auto *options = op->builtin_options_as_SoftmaxOptions(); // Beta param.beta = options->beta(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Softmax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Softmax>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadConcatenation(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadConcatenation(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Concat::Param param; const auto *options = op->builtin_options_as_ConcatenationOptions(); // Axis param.axis = options->axis(); // activation unused - std::unique_ptr<ir::Operation> new_op(new ir::operation::Concat(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Concat>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFill(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadFC(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - 
loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Fill(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} + ir::operation::FullyConnected::Param param; + const auto *options = op->builtin_options_as_FullyConnectedOptions(); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFC(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + param.activation = convertActivation(options->fused_activation_function()); + // weights_format unused - loadOperationIO(op, inputs, outputs); + const auto fc = loadOperationTo<ir::operation::FullyConnected>(op, subg, param); - const auto &input_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::INPUT)); - auto &weights_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::WEIGHT)); + const auto &input_operand = + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::INPUT)); + auto &weights_operand = + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::WEIGHT)); if (input_operand.typeInfo().type() == ir::DataType::FLOAT32 && weights_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM) { weights_operand.type(ir::DataType::QUANT_INT8_SYMM); } - - ir::operation::FullyConnected::Param param; - const auto *options = op->builtin_options_as_FullyConnectedOptions(); - - param.activation = convertActivation(options->fused_activation_function()); - // weights_format unused - - std::unique_ptr<ir::Operation> new_op(new ir::operation::FullyConnected(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadAddV2(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::BinaryArithmetic::Param param; param.arithmetic_type = ir::operation::BinaryArithmetic::ArithmeticType::ADD; @@ -722,21 +705,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir: param.activation = convertActivation(fused_activation_func); } - std::unique_ptr<ir::Operation> new_op( - new ir::operation::BinaryArithmetic(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::BinaryArithmetic::ArithmeticType op_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadBinaryArithmetic( + const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::BinaryArithmetic::Param param; param.arithmetic_type = op_type; switch (op_type) @@ -771,172 +746,66 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operat break; } - std::unique_ptr<ir::Operation> new_op( - new ir::operation::BinaryArithmetic(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, 
param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPack(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadPack(const Operator *op, ir::Graph &subg) { - // This runtime_error will be removed if the one of backend supports this operation - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Pack::Param param; const auto *options = op->builtin_options_as_PackOptions(); param.num = options->values_count(); param.axis = options->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pack(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Pack>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseActivation( +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseActivation( const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, float alpha, float beta) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseActivation::Param param; param.op_type = op_type; param.alpha = alpha; param.beta = beta; - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseActivation(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ElementwiseActivation>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeBilinear(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadResizeBilinear(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto size = inputs.at(1); - - // FIXME Handle ResizeBilinearOptions. 
- if (!subg.operands().at(size).isConstant()) - throw std::runtime_error("ResizeBilinear: non-constant 'size' is not supported."); - - std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>(); - ir::operation::ResizeBilinear::Param param; - param.height_out = size_v[0]; - param.width_out = size_v[1]; param.align_corners = op->builtin_options_as_ResizeBilinearOptions()->align_corners(); param.half_pixel_centers = op->builtin_options_as_ResizeBilinearOptions()->half_pixel_centers(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::ResizeBilinear({input}, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ResizeBilinear>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeNearestNeighbor(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto size = inputs.at(1); - - if (!subg.operands().at(size).isConstant()) - throw std::runtime_error("ResizeNearestNeighbor: non-constant 'size' is not supported."); - - std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>(); - ir::operation::ResizeNearestNeighbor::Param param; - param.height_out = size_v[0]; - param.width_out = size_v[1]; param.align_corners = op->builtin_options_as_ResizeNearestNeighborOptions()->align_corners(); - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ResizeNearestNeighbor({input}, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSelect(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Select(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ResizeNearestNeighbor>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSquaredDifference(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReduce(const Operator *op, ir::Graph &subg, + ir::operation::Reduce::ReduceType reduce_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::SquaredDifference(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTranspose(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto perm = inputs.at(1); - - if (!subg.operands().at(perm).isConstant()) - throw std::runtime_error("Transpose: non-constant 'perm' is not supported."); - - ir::operation::Transpose::Param param; - param.perm = subg.operands().at(perm).template asVector<int>(); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Transpose({input}, outputs, param)); - 
subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::Reduce::ReduceType reduce_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReduce(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reduce::Param param; param.reduce_type = reduce_type; param.keep_dims = op->builtin_options_as_ReducerOptions()->keep_dims(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reduce>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReduceAll(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reduce::Param param; param.reduce_type = ir::operation::Reduce::ReduceType::ALL; if (op->custom_options() == nullptr) @@ -952,64 +821,28 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op, param.keep_dims = attr_map["keep_dims"].AsBool(); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReverseV2(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reverse(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPad(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pad(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reduce>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseBinary(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseBinary( + const Operator *op, ir::Graph &subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseBinary::Param param; param.op_type = op_type; - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseBinary(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ElementwiseBinary>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary( - const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseUnary(const Operator *op, 
ir::Graph &subg, + ir::operation::ElementwiseUnary::Type op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseUnary::Param param; param.op_type = op_type; + const auto eu = loadOperationTo<ir::operation::ElementwiseUnary>(op, subg, param); if (op_type == ir::operation::ElementwiseUnary::Type::CAST) { auto qasymm8ToUint8 = [](ir::Operand &operand) { @@ -1018,61 +851,24 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary( operand.type(ir::DataType::UINT8); } }; - qasymm8ToUint8(subg.operands().at(inputs.at(ir::operation::ElementwiseUnary::Input::INPUT))); - qasymm8ToUint8(subg.operands().at(outputs.at(0))); + qasymm8ToUint8( + subg.operands().at(eu->getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT))); + qasymm8ToUint8(subg.operands().at(eu->getOutputs().at(0))); } - - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseUnary(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadExpandDims(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::ExpandDims(inputs, outputs)); - subg.addOperation(std::move(new_op)); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadGather(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadGather(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::Gather::Param param; param.axis = op->builtin_options_as_GatherOptions()->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Gather(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToBatchND(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::SpaceToBatchND{inputs, outputs}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Gather>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadBatchMatMul(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::BatchMatMul::Param param; const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1105,89 +901,21 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *o " as " + EnumNameBuiltinOperator(BuiltinOperator::BuiltinOperator_BATCH_MATMUL)); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchMatMul{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchToSpaceND(const Operator 
*op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchToSpaceND{inputs, outputs}}; - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadMatrixBandPart(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::MatrixBandPart(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BatchMatMul>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSpaceToDepth(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; ir::operation::SpaceToDepth::Param param; - const auto *options = op->builtin_options_as_SpaceToDepthOptions(); - param.block_size = options->block_size(); - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStatelessRandomUniform(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::StatelessRandomUniform(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::SpaceToDepth>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadRank(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Rank(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg) { ir::OperandIndexSequence inputs; ir::OperandIndexSequence outputs; @@ -1237,7 +965,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir loadReduceAll(op, subg); break; case BuiltinOP::MatrixBandPart: - loadMatrixBandPart(op, subg); + loadOperationTo<ir::operation::MatrixBandPart>(op, subg); break; case BuiltinOP::BatchMatMul: loadBatchMatMul(op, subg); @@ -1246,13 +974,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir loadEinsum(op, subg); break; 
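The loadCustom cases above show the pattern this refactoring is built around: operations that previously had their own one-off loader (MatrixBandPart here, and BroadcastTo, StatelessRandomUniform, Select, Pad and many others further down) are now created directly through the loadOperationTo<OpIR> helper declared earlier in this header. Roughly, the two usage shapes are as follows; both calls are taken from hunks in this file, and the helper itself resolves the operator's input/output tensor indices, adds the IR node to the subgraph, and returns it:

// Parameter-less operation: a single call reads the inputs/outputs and adds the node.
loadOperationTo<ir::operation::MatrixBandPart>(op, subg);

// Operation with parameters: build the Param first, then forward it. The returned
// pointer lets callers inspect the created node afterwards (see loadFC and loadArgMax).
ir::operation::SpaceToDepth::Param param;
param.block_size = op->builtin_options_as_SpaceToDepthOptions()->block_size();
loadOperationTo<ir::operation::SpaceToDepth>(op, subg, param);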
case BuiltinOP::BroadcastTo: - loadBroadcastTo(op, subg); + loadOperationTo<ir::operation::BroadcastTo>(op, subg); break; case BuiltinOP::FusedBatchNorm: loadFusedBatchNorm(op, subg); break; case BuiltinOP::StatelessRandomUniform: - loadStatelessRandomUniform(op, subg); + loadOperationTo<ir::operation::StatelessRandomUniform>(op, subg); break; case BuiltinOP::Erf: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ERF); @@ -1285,141 +1013,71 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSqueeze(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSqueeze(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - ir::operation::Squeeze::Param param{}; + ir::operation::Squeeze::Param param; const auto *options = op->builtin_options_as_SqueezeOptions(); const auto *dims = options->squeeze_dims(); if (dims) { - if (dims->Length() > sizeof(param.dims) / sizeof(param.dims[0])) + if (dims->size() > sizeof(param.dims) / sizeof(param.dims[0])) throw std::runtime_error("Squeeze: 'param.ndims' is out of range."); - param.ndim = dims->Length(); + param.ndim = dims->size(); for (int i = 0; i < param.ndim; ++i) param.dims[i] = dims->Get(i); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Squeeze(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Squeeze>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPrelu(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSplit(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::PReLU(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSplit(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - // Notice : input order is strange for tflite split - auto input = inputs.at(1); - auto axis = inputs.at(0); - - // FIXME Handle SplitOptions. 
- if (!subg.operands().at(axis).isConstant()) - throw std::runtime_error("Split: non-constant 'axis' is not supported."); - - ir::operation::Split::Param param{}; - param.axis = subg.operands().at(axis).template asScalar<int>(); + ir::operation::Split::Param param; const auto *options = op->builtin_options_as_SplitOptions(); param.num_splits = options->num_splits(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Split({input}, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Split>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSplitV(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSplitV(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - ir::operation::SplitV::Param param{}; - + ir::operation::SplitV::Param param; const auto *options = op->builtin_options_as_SplitVOptions(); param.num_splits = options->num_splits(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::SplitV(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSlice(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::Slice{inputs, outputs}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::SplitV>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStridedSlice(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadStridedSlice(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::StridedSlice::Param param; - const auto *options = op->builtin_options_as_StridedSliceOptions(); param.begin_mask = options->begin_mask(); param.end_mask = options->end_mask(); param.shrink_axis_mask = options->shrink_axis_mask(); - std::unique_ptr<ir::Operation> new_op{new ir::operation::StridedSlice{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::StridedSlice>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadUnpack(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadUnpack(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Unpack::Param param; const auto *options = op->builtin_options_as_UnpackOptions(); param.num = options->num(); param.axis = options->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Unpack(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Unpack>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadComparison(const 
Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Comparison::Param param; - const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); switch (builtin_op) @@ -1447,24 +1105,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Comparison(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Comparison>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadEinsum(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::Einsum::Param param; - - if (inputs.size() != 2) - { - throw std::runtime_error{"Einsum: NYI input - only support two inputs"}; - } - if (op->custom_options() == nullptr) { throw std::runtime_error{"Einsum: empty equation"}; @@ -1478,24 +1125,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir param.equation = attr_map["equation"].ToString(); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::Einsum{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + const auto es = loadOperationTo<ir::operation::Einsum>(op, subg, param); + if (es->getInputs().size() != 2) + { + throw std::runtime_error{"Einsum: NYI input - only support two inputs"}; + } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadFusedBatchNorm(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::FusedBatchNorm::Param param; - - if (inputs.size() != 5) - { - throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"}; - } - if (op->custom_options() == nullptr) { throw std::runtime_error{"FusedBatchNorm: empty option"}; @@ -1511,195 +1150,104 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator param.data_format = attr_map["data_format"].ToString(); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::FusedBatchNorm{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + const auto fbn = loadOperationTo<ir::operation::FusedBatchNorm>(op, subg, param); + + if (fbn->getInputs().size() != 5) + { + throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"}; + } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOneHot(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOneHot(const Operator *op, ir::Graph &subg) { if (op->inputs()->size() != 4 || op->outputs()->size() != 1) throw std::runtime_error("OneHot Op has wrong number of input or output tensors."); - // Set input and output tensors - ir::OperandIndexSequence inputs, outputs; - loadOperationIO(op, inputs, outputs); - // Set parameter - const auto 
axis = op->builtin_options_as_OneHotOptions()->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::OneHot(inputs, outputs, {axis})); - subg.addOperation(std::move(new_op)); -} + ir::operation::OneHot::Param param; + param.axis = op->builtin_options_as_OneHotOptions()->axis(); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadShape(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - // ir::operation::Shape::Param param; - // const auto *options = op->builtin_options_as_ShapeOptions(); - // param.out_type = tensorTypeToDataType(options->out_type()); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Shape(inputs, outputs /*, param*/)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::OneHot>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadIf(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadIf(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + const auto *options = op->builtin_options_as_IfOptions(); + const int32_t then_index = options->then_subgraph_index(); + const int32_t else_index = options->else_subgraph_index(); - loadOperationIO(op, inputs, outputs); + verifySubgraphIndex(then_index); + verifySubgraphIndex(else_index); ir::operation::If::Param param; - const auto *options = op->builtin_options_as_IfOptions(); - const uint32_t then_index = options->then_subgraph_index(); - const uint32_t else_index = options->else_subgraph_index(); - param.then_subg_index = ir::SubgraphIndex{then_index}; - param.else_subg_index = ir::SubgraphIndex{else_index}; + param.then_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(then_index)}; + param.else_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(else_index)}; - std::unique_ptr<ir::Operation> new_op(new ir::operation::If(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::If>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadWhile(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadWhile(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + const auto *options = op->builtin_options_as_WhileOptions(); + const int32_t cond_index = options->cond_subgraph_index(); + const int32_t body_index = options->body_subgraph_index(); - loadOperationIO(op, inputs, outputs); + verifySubgraphIndex(cond_index); + verifySubgraphIndex(body_index); ir::operation::While::Param param; - const auto *options = op->builtin_options_as_WhileOptions(); - const uint32_t cond_index = options->cond_subgraph_index(); - const uint32_t body_index = options->body_subgraph_index(); - param.cond_subg_index = ir::SubgraphIndex{cond_index}; - param.body_subg_index = ir::SubgraphIndex{body_index}; + param.cond_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(cond_index)}; + param.body_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(body_index)}; - std::unique_ptr<ir::Operation> new_op(new ir::operation::While(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::While>(op, subg, param); 
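Note on the loadIf/loadWhile hunks above: the subgraph indices are now read as signed 32-bit values and passed through verifySubgraphIndex before being narrowed into ir::SubgraphIndex, so a negative or out-of-range index in a malformed model fails with a clear error instead of silently wrapping. A condensed sketch of that check as a standalone helper (checkedSubgraphIndex is hypothetical; the real code keeps the check in verifySubgraphIndex and performs the cast at the call site):

// Reject indices that do not name an existing subgraph before constructing the IR index.
ir::SubgraphIndex checkedSubgraphIndex(int32_t subg_index, uint32_t num_subgraphs)
{
  if (subg_index < 0 || subg_index >= static_cast<int32_t>(num_subgraphs))
    throw std::runtime_error{std::string{"Invalid subgraph index - "} +
                             std::to_string(subg_index)};
  return ir::SubgraphIndex{static_cast<uint32_t>(subg_index)};
}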
} -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadArgMax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadArgMax(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - auto inputOperand = subg.operands().at(inputs.at(0)); - auto axisOperand = subg.operands().at(inputs.at(1)); - - if (!axisOperand.isConstant()) - throw std::runtime_error("ArgMax: non-constant 'axis' is not supported."); - if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 || - axisOperand.typeInfo().type() == ir::DataType::INT64))) - throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported."); - ir::operation::ArgMax::Param param; - param.axis = axisOperand.template asVector<int>()[0]; const auto output_type = op->builtin_options_as_ArgMaxOptions()->output_type(); switch (output_type) { case TensorType::TensorType_INT32: case TensorType::TensorType_INT64: + param.output_type = tensorTypeToDataType(output_type); break; default: throw std::runtime_error("ArgMax: `output_type` must be either int32 or int64."); } - param.output_type = tensorTypeToDataType(output_type); - std::unique_ptr<ir::Operation> new_op(new ir::operation::ArgMax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} + auto am = loadOperationTo<ir::operation::ArgMax>(op, subg, param); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPow(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pow(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadRange(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Range(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTile(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - auto multiples = inputs.at(ir::operation::Tile::MULTIPLES); - - if (!subg.operands().at(multiples).isConstant()) - throw std::runtime_error("Tile: non-constant 'multiples' is not supported."); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Tile(inputs, outputs)); - subg.addOperation(std::move(new_op)); + auto &axisOperand = subg.operands().at(am->getInputs().at(ir::operation::ArgMax::Input::AXIS)); + if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 || + axisOperand.typeInfo().type() == ir::DataType::INT64))) + throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported."); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadLogSoftmax(const Operator *op, 
ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::LogSoftmax::Param param; - // In tflite, beta is fixed to 1.0 and axis is fixed to -1. param.beta = 1.0f; param.axis = -1; - std::unique_ptr<ir::Operation> new_op(new ir::operation::LogSoftmax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadL2Normalization(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::L2Normalization(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::LogSoftmax>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadLeakyRelu(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadLeakyRelu(const Operator *op, ir::Graph &subg) { float alpha = op->builtin_options_as_LeakyReluOptions()->alpha(); loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LEAKY_RELU, alpha, 1.f); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOperation(const Operator *op, ir::Graph &subg) { const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1733,16 +1281,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadFC(op, subg); return; case BuiltinOperator::BuiltinOperator_ADD: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::ADD>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::ADD); return; case BuiltinOperator::BuiltinOperator_SUB: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::SUB>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::SUB); return; case BuiltinOperator::BuiltinOperator_MUL: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::MUL>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::MUL); return; case BuiltinOperator::BuiltinOperator_DIV: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::DIV>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::DIV); return; case BuiltinOperator::BuiltinOperator_PACK: loadPack(op, subg); @@ -1769,40 +1317,37 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::RSQRT); return; case BuiltinOperator::BuiltinOperator_SELECT: - loadSelect(op, subg); - return; case BuiltinOperator::BuiltinOperator_SELECT_V2: - // Use same loader with BuiltinOperator_SELECT - loadSelect(op, subg); + loadOperationTo<ir::operation::Select>(op, subg); return; case BuiltinOperator::BuiltinOperator_SQRT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SQRT); return; case BuiltinOperator::BuiltinOperator_SQUARED_DIFFERENCE: - loadSquaredDifference(op, subg); + loadOperationTo<ir::operation::SquaredDifference>(op, subg); 
return; case BuiltinOperator::BuiltinOperator_TANH: loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::TANH, 1.f, 1.f); return; case BuiltinOperator::BuiltinOperator_TRANSPOSE: - loadTranspose(op, subg); + loadOperationTo<ir::operation::Transpose>(op, subg); return; case BuiltinOperator::BuiltinOperator_MEAN: - loadReduce<ir::operation::Reduce::ReduceType::MEAN>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::MEAN); return; case BuiltinOperator::BuiltinOperator_REDUCE_ANY: - loadReduce<ir::operation::Reduce::ReduceType::ANY>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::ANY); return; case BuiltinOperator::BuiltinOperator_REDUCE_MAX: - loadReduce<ir::operation::Reduce::ReduceType::MAX>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::MAX); return; case BuiltinOperator::BuiltinOperator_REVERSE_V2: - loadReverseV2(op, subg); + loadOperationTo<ir::operation::Reverse>(op, subg); return; case BuiltinOperator::BuiltinOperator_PAD: case BuiltinOperator::BuiltinOperator_PADV2: - loadPad(op, subg); + loadOperationTo<ir::operation::Pad>(op, subg); return; case BuiltinOperator::BuiltinOperator_LOGISTIC: loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LOGISTIC); @@ -1811,19 +1356,19 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::EXP); return; case BuiltinOperator::BuiltinOperator_EXPAND_DIMS: - loadExpandDims(op, subg); + loadOperationTo<ir::operation::ExpandDims>(op, subg); return; case BuiltinOperator::BuiltinOperator_GATHER: loadGather(op, subg); return; case BuiltinOperator::BuiltinOperator_SPACE_TO_BATCH_ND: - loadSpaceToBatchND(op, subg); + loadOperationTo<ir::operation::SpaceToBatchND>(op, subg); return; case BuiltinOperator::BuiltinOperator_BATCH_TO_SPACE_ND: - loadBatchToSpaceND(op, subg); + loadOperationTo<ir::operation::BatchToSpaceND>(op, subg); return; case BuiltinOperator::BuiltinOperator_SUM: - loadReduce<ir::operation::Reduce::ReduceType::SUM>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::SUM); return; case BuiltinOperator::BuiltinOperator_CUSTOM: loadCustom(op, subg); @@ -1832,7 +1377,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSqueeze(op, subg); return; case BuiltinOperator::BuiltinOperator_PRELU: - loadPrelu(op, subg); + loadOperationTo<ir::operation::PReLU>(op, subg); return; case BuiltinOperator::BuiltinOperator_SPLIT: loadSplit(op, subg); @@ -1841,7 +1386,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSplitV(op, subg); return; case BuiltinOperator::BuiltinOperator_SLICE: - loadSlice(op, subg); + loadOperationTo<ir::operation::Slice>(op, subg); return; case BuiltinOperator::BuiltinOperator_STRIDED_SLICE: loadStridedSlice(op, subg); @@ -1850,10 +1395,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadUnpack(op, subg); return; case BuiltinOperator::BuiltinOperator_MINIMUM: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN>(op, subg); + loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN); return; case BuiltinOperator::BuiltinOperator_MAXIMUM: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX>(op, subg); + loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX); 
return; case BuiltinOperator::BuiltinOperator_CAST: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::CAST); @@ -1879,10 +1424,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SIN); return; case BuiltinOperator::BuiltinOperator_SHAPE: - loadShape(op, subg); + loadOperationTo<ir::operation::Shape>(op, subg); return; case BuiltinOperator::BuiltinOperator_REDUCE_PROD: - loadReduce<ir::operation::Reduce::ReduceType::PROD>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::PROD); return; case BuiltinOperator::BuiltinOperator_IF: loadIf(op, subg); @@ -1903,26 +1448,26 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ROUND); return; case BuiltinOperator::BuiltinOperator_POW: - loadPow(op, subg); + loadOperationTo<ir::operation::Pow>(op, subg); return; case BuiltinOperator::BuiltinOperator_LOGICAL_NOT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::LOGICAL_NOT); return; case BuiltinOperator::BuiltinOperator_LOGICAL_OR: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR>( - op, subg); + loadElementwiseBinary(op, subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR); return; case BuiltinOperator::BuiltinOperator_FILL: - loadFill(op, subg); + loadOperationTo<ir::operation::Fill>(op, subg); return; case BuiltinOperator::BuiltinOperator_ZEROS_LIKE: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ZEROS_LIKE); return; case BuiltinOperator::BuiltinOperator_TILE: - loadTile(op, subg); + loadOperationTo<ir::operation::Tile>(op, subg); return; case BuiltinOperator::BuiltinOperator_RANGE: - loadRange(op, subg); + loadOperationTo<ir::operation::Range>(op, subg); return; case BuiltinOperator::BuiltinOperator_BATCH_MATMUL: loadBatchMatMul(op, subg); @@ -1937,13 +1482,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSpaceToDepth(op, subg); return; case BuiltinOperator::BuiltinOperator_L2_NORMALIZATION: - loadL2Normalization(op, subg); + loadOperationTo<ir::operation::L2Normalization>(op, subg); break; case BuiltinOperator::BuiltinOperator_LEAKY_RELU: loadLeakyRelu(op, subg); return; case BuiltinOperator::BuiltinOperator_RANK: - loadRank(op, subg); + loadOperationTo<ir::operation::Rank>(op, subg); return; default: throw std::runtime_error( @@ -1951,8 +1496,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadModel() +template <typename LoaderDomain> void BaseLoader<LoaderDomain>::loadModel() { LoaderDomain::VerifyModelBuffer(*_verifier.get()); _model = LoaderDomain::GetModel(_base); @@ -1967,8 +1511,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadModel() auto subgraphs = std::make_unique<ir::Subgraphs>(); for (uint32_t subgraph_index = 0; subgraph_index < domain_subgraphs->size(); ++subgraph_index) { - auto subg = - static_cast<SpecificLoader *>(this)->loadSubgraph((*_model->subgraphs())[subgraph_index]); + auto subg = loadSubgraph((*_model->subgraphs())[subgraph_index]); subgraphs->push(ir::SubgraphIndex{subgraph_index}, std::move(subg)); } _subgraphs = std::move(subgraphs); diff --git a/runtime/onert/frontend/circle/CMakeLists.txt 
b/runtime/onert/frontend/circle/CMakeLists.txt index 8bcf85dd3..76dca9989 100644 --- a/runtime/onert/frontend/circle/CMakeLists.txt +++ b/runtime/onert/frontend/circle/CMakeLists.txt @@ -8,7 +8,7 @@ add_library(circle_loader SHARED ${CIRCLE_LOADER_SOURCES}) target_include_directories(circle_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(circle_loader PUBLIC onert_core) +target_link_libraries(circle_loader PRIVATE onert_core) target_link_libraries(circle_loader PRIVATE base_loader nnfw_common nnfw_coverage) target_link_libraries(circle_loader PRIVATE circle_schema) diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc index 92a9ee7a5..4565ffc00 100644 --- a/runtime/onert/frontend/circle/src/circle_loader.cc +++ b/runtime/onert/frontend/circle/src/circle_loader.cc @@ -69,7 +69,7 @@ struct LoaderDomain static bool VerifyModelBuffer(Verifier &verifier) { return circle::VerifyModelBuffer(verifier); } }; -class CircleLoader final : public base_loader::BaseLoader<LoaderDomain, CircleLoader> +class CircleLoader final : public base_loader::BaseLoader<LoaderDomain> { protected: void loadInstanceNorm(const Operator *op, ir::Graph &subg); @@ -91,7 +91,8 @@ public: } } - std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) +private: + std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) override { auto subg = std::make_unique<ir::Graph>(); // Load tensors diff --git a/runtime/onert/frontend/nnapi/execution.cc b/runtime/onert/frontend/nnapi/execution.cc index ce7da579e..56ca5ef00 100644 --- a/runtime/onert/frontend/nnapi/execution.cc +++ b/runtime/onert/frontend/nnapi/execution.cc @@ -94,12 +94,36 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution *execution, int32 // Omitted optional input // LSTM operation's some inputs can be optional input + // Transpose operation's permutation input can be optional input if ((buffer == nullptr) && (length == 0)) { + uint32_t dims[1] = {0}; + ANeuralNetworksOperandType compared_shape; + compared_shape.dimensionCount = 1; + compared_shape.dimensions = dims; if (execution->hasUnspecifiedDims(operand_index)) { return ANEURALNETWORKS_NO_ERROR; } + else if (type == nullptr && execution->IsOptionalInput(operand_index)) + { + if (!execution->setOptionalInput(index, type, buffer, length)) + { + VERBOSE(NNAPI::Execution) << "setInput: Fail to set optional input" << std::endl; + return ANEURALNETWORKS_BAD_DATA; + } + return ANEURALNETWORKS_NO_ERROR; + } + // TODO Changes the condition to check zero sized + else if (execution->compareShape(&compared_shape, operand_index)) + { + if (!execution->setInput(index, type, buffer, length)) + { + VERBOSE(NNAPI::Execution) << "setInput: Fail to set input" << std::endl; + return ANEURALNETWORKS_BAD_DATA; + } + return ANEURALNETWORKS_NO_ERROR; + } else { VERBOSE(NNAPI::Execution) << "setInput: Cannot handle fully-specified shape on model build " diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc index eb12d7e76..6114b74b0 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc @@ -98,6 +98,17 @@ bool ANeuralNetworksExecution::compareShape(const ANeuralNetworksOperandType *ty return operand_shape == shape_from_type; } +bool ANeuralNetworksExecution::IsOptionalInput(const onert::ir::OperandIndex index) 
noexcept +{ + const auto &operand_shape = _execution->primary_subgraph().operands().at(index).shape(); + for (int32_t i = 0; i < operand_shape.rank(); ++i) + { + if (operand_shape.dim(i) != 0) + return false; + } + return true; +} + bool ANeuralNetworksExecution::hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept { const auto operand_shape = _execution->primary_subgraph().operands().at(index).shape(); @@ -148,6 +159,45 @@ bool ANeuralNetworksExecution::setInput(uint32_t index, const ANeuralNetworksOpe return true; } +bool ANeuralNetworksExecution::setOptionalInput(uint32_t index, + const ANeuralNetworksOperandType *type, + const void *buffer, size_t length) noexcept +{ + assert(type == nullptr); + assert(buffer == nullptr); + assert(length == 0); + try + { + onert::ir::IOIndex input_index{index}; + const auto operand_index = getInputOperandIndex(index); + + const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo(); + const auto shape = (type != nullptr) + ? NNAPIConvert::getShape(type) + : _execution->primary_subgraph().operands().at(operand_index).shape(); + + // ANeuralNetworksExecution::setInput() uses only shape information + ANeuralNetworksOperandType optional_input_type; + optional_input_type.dimensionCount = shape.rank(); + std::vector<uint32_t> dims(optional_input_type.dimensionCount); + for (uint32_t i = 0; i < optional_input_type.dimensionCount; ++i) + { + dims.at(i) = shape.dim(i); + } + optional_input_type.dimensions = dims.data(); + + return setInput(index, &optional_input_type, buffer, length); + } + catch (const std::exception &e) + { + VERBOSE(EXCEPTION) << e.what() << std::endl; + + return false; + } + + return true; +} + bool ANeuralNetworksExecution::setOutput(uint32_t index, const ANeuralNetworksOperandType *type, void *buffer, size_t length) noexcept { diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h index 848ae743f..1f4b868f6 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h @@ -35,6 +35,8 @@ public: public: bool setInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer, size_t length) noexcept; + bool setOptionalInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer, + size_t length) noexcept; bool setOutput(uint32_t index, const ANeuralNetworksOperandType *type, void *buffer, size_t length) noexcept; bool startExecute(void) noexcept; @@ -46,6 +48,7 @@ public: const onert::ir::OperandIndex index) noexcept; bool compareShape(const ANeuralNetworksOperandType *type, const onert::ir::OperandIndex index) noexcept; + bool IsOptionalInput(const onert::ir::OperandIndex index) noexcept; bool hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept; size_t getOperandSize(const onert::ir::OperandIndex index) noexcept; const std::shared_ptr<onert::exec::Execution> instance(void) noexcept; diff --git a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc index 15a279a7e..bb42f2b08 100644 --- a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc @@ -16,10 +16,10 @@ #include <gtest/gtest.h> -#include "wrapper/ANeuralNetworksModel.h" +#include "ANeuralNetworksModel.h" -TEST(MODEL, model_build) +TEST(MODEL, neg_model_build) { 
ANeuralNetworksModel model; - ASSERT_EQ(model.isFinished(), false); + ASSERT_FALSE(model.isFinished()); } diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc index 8e3d83db4..e6c38f5f8 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc @@ -708,31 +708,7 @@ OperationFactory::OperationFactory() return new operation::StridedSlice{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_TRANSPOSE] = [](const OperationFactory::Param &init_param, - Operands &operands) { - // TODO make this work with init_param.input_count == 1 (when permutation vector is optional) - - // Inputs - // 0: An n-D tensor, specifying the tensor to be transposed. - // 1: An optional 1-D Tensor of {@link ANEURALNETWORKS_TENSOR_INT32}, - // the permutation of the dimensions of the input tensor. - // The returned tensor's dimension i corresponds to the input dimension - // perm[i]. If perm is not given, it is set to (n-1...0), where n is the - // rank of the input tensor. Hence by default, this operation performs a - // regular matrix transpose on 2-D input Tensors. - assert(init_param.input_count == 2); - assert(init_param.output_count == 1); - - OperandIndexSequence inputs{init_param.inputs[0]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - std::vector<std::int32_t> perm = - operands.at(OperandIndex{init_param.inputs[1]}).asVector<std::int32_t>(); - - operation::Transpose::Param param; - param.perm.assign(perm.cbegin(), perm.cend()); - - return new operation::Transpose{inputs, outputs, param}; - }; + _map[ANEURALNETWORKS_TRANSPOSE] = createSimpleBinaryOp<operation::Transpose>; _map[ANEURALNETWORKS_MUL] = getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::MUL); @@ -982,6 +958,28 @@ OperationFactory::OperationFactory() return new operation::ResizeBilinear{inputs, outputs, param}; }; + _map[ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR] = [](const OperationFactory::Param &init_param, + Operands &operands) { + assert((init_param.input_count == 3 || init_param.input_count == 4) && + init_param.output_count == 1); + + OperandIndexSequence outputs{init_param.outputs[0]}; + + // Each input should be interpreted as follows: + // + // 0 -> IFM Index + // 1 -> Height Index + // 2 -> Width Index + OperandIndexSequence inputs{init_param.inputs[0]}; + + operation::ResizeNearestNeighbor::Param param; + param.height_out = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<int32_t>(); + param.width_out = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<int32_t>(); + param.align_corners = false; + // The layout input is not supported yet + return new operation::ResizeNearestNeighbor{inputs, outputs, param}; + }; + _map[ANEURALNETWORKS_RELU1] = getElementwiseActivationGenerator( onert::ir::operation::ElementwiseActivation::Type::RELU, 1.f, -1.f); @@ -1304,6 +1302,105 @@ OperationFactory::OperationFactory() } param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>(); param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>(); + // This is initialization to prevent warning or error by static code analyzer. 
LSTM operation + // does not need time_major + param.time_major = false; + + return new operation::LSTM{inputs, outputs, param}; + }; + + _map[ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM] = [](const OperationFactory::Param &init_param, + Operands &operands) { + assert((init_param.input_count >= 24 || init_param.input_count <= 28) && + (init_param.output_count >= 1 && init_param.output_count <= 3)); + + // Each input should be interpreted as follows: + // + // 0 -> Input Tensor Index + // 1 -> Input to Input Tensor Index + // 2 -> Input to Forget Tensor Index + // 3 -> Input to Cell Tensor Index + // 4 -> Input to Output Tensor Index + // 5 -> Recurrent to Input Weights Tensor Index + // 6 -> Recurrent to Forget Weights Tensor Index + // 7 -> Recurrent to Cell Weights Tensor Index + // 8 -> Recurrent to Output Weights Tensor Index + // 9 -> Cell to Input Weights Tensor Index + // 10 -> Cell to Forget Weights Tensor Index + // 11 -> Cell to Output Weights Tensor Index + // 12 -> Input Gate Bias Tensor Index + // 13 -> Forget Gate Bias Tensor Index + // 14 -> Cell Bias Tensor Index + // 15 -> Output Gate Bias Tensor Index + // 16 -> Projection Weights Tensor Index + // 17 -> Projection Bias Tensor Index + // 18 -> Output State In Tensor Index + // 19 -> Cell State In Tensor Index + assert(init_param.input_count - 3 > 20); + OperandIndexSequence inputs; + for (uint32_t n = 0; n < 20; ++n) + { + inputs.append(OperandIndex{init_param.inputs[n]}); + } + + // 24 -> Input Layer Normalization Weights Tensor Index + // 25 -> Forget Layer Normalization Weights Tensor Index + // 26 -> Cell Layer Normalization Weights Tensor Index + // 27 -> Output Layer Normalization Weights Tensor Index + if (init_param.input_count > 24) + { + for (uint32_t n = 24; n < 28; ++n) + { + if (init_param.input_count > n) + { + inputs.append(OperandIndex{init_param.inputs[n]}); + } + } + } + + // Each output should be interpreted as follows: + // + // 0 -> Output Tensor Index -> 3 + // 1 -> Output State Out Tensor Index + // 2 -> Cell State Out Tensor Index + const OperandIndex scratch_buffer_index; + OperandIndex output_state_index = + init_param.output_count >= 2 ? OperandIndex{init_param.outputs[1]} : OperandIndex(); + OperandIndex cell_state_index = + init_param.output_count >= 3 ? 
OperandIndex{init_param.outputs[2]} : OperandIndex(); + const OperandIndex output_index = OperandIndex{init_param.outputs[0]}; + OperandIndexSequence outputs{scratch_buffer_index, output_state_index, cell_state_index, + output_index}; + + operation::LSTM::Param param; + const auto activation_index = OperandIndex{init_param.inputs[20]}; + switch (operands.at(activation_index).asScalar<int32_t>()) + { + case 0: + param.activation = Activation::NONE; + break; + case 1: + param.activation = Activation::RELU; + break; + case 2: + param.activation = Activation::RELU1; + break; + case 3: + param.activation = Activation::RELU6; + break; + case 4: + param.activation = Activation::TANH; + break; + case 6: + param.activation = Activation::SIGMOID; + break; + default: + throw std::runtime_error("Unsupported activation type"); + break; + } + param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>(); + param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>(); + param.time_major = operands.at(OperandIndex{init_param.inputs[23]}).asScalar<bool>(); return new operation::LSTM{inputs, outputs, param}; }; @@ -1406,7 +1503,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_ABS_EX _map[ANEURALNETWORKS_ABS_EX] = _map[ANEURALNETWORKS_ABS]; - _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &operands) { + _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); OperandIndexSequence outputs{init_param.outputs[0]}; @@ -1415,10 +1512,9 @@ OperationFactory::OperationFactory() // // 0 -> Input Tensor Index // 1 -> Axis Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; operation::ArgMax::Param param; - param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>(); // NNAPI ARGMAX output type is always int32 param.output_type = DataType::INT32; @@ -1517,7 +1613,7 @@ OperationFactory::OperationFactory() assert(init_param.input_count == 3); assert(init_param.output_count >= 1); // At least one output tensor and axis - OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence inputs{init_param.inputs[1], init_param.inputs[0]}; OperandIndexSequence outputs; for (uint32_t n = 0; n < init_param.output_count; ++n) { @@ -1525,7 +1621,6 @@ OperationFactory::OperationFactory() } operation::Split::Param param; - param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>(); param.num_splits = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<std::int32_t>(); return new operation::Split{inputs, outputs, param}; diff --git a/runtime/onert/frontend/tflite/CMakeLists.txt b/runtime/onert/frontend/tflite/CMakeLists.txt index fcadf5223..604a9e4cb 100644 --- a/runtime/onert/frontend/tflite/CMakeLists.txt +++ b/runtime/onert/frontend/tflite/CMakeLists.txt @@ -8,7 +8,7 @@ add_library(tflite_loader SHARED ${TFLITE_LOADER_SOURCES}) target_include_directories(tflite_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(tflite_loader PUBLIC onert_core) +target_link_libraries(tflite_loader PRIVATE onert_core) target_link_libraries(tflite_loader PRIVATE base_loader nnfw_common nnfw_coverage) install(TARGETS tflite_loader DESTINATION lib) diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc 
b/runtime/onert/frontend/tflite/src/tflite_loader.cc index 7eef15717..fe4295ada 100644 --- a/runtime/onert/frontend/tflite/src/tflite_loader.cc +++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc @@ -62,7 +62,7 @@ struct LoaderDomain } }; -class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain, TFLiteLoader> +class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain> { public: using BaseLoader::BaseLoader; @@ -78,7 +78,8 @@ public: } } - std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg) +private: + std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg) override { auto subg = std::make_unique<ir::Graph>(); // Load tensors diff --git a/runtime/onert/test/graph/Index.cc b/runtime/onert/test/graph/Index.cc index 358e64c82..2d110e326 100644 --- a/runtime/onert/test/graph/Index.cc +++ b/runtime/onert/test/graph/Index.cc @@ -20,7 +20,7 @@ using Index = ::onert::util::Index<uint32_t, struct TestTag>; -TEST(Index, index_test) +TEST(Index, neg_index_test) { Index idx1{1u}; Index idx2{2u}; diff --git a/runtime/onert/test/graph/operand/IndexSet.cc b/runtime/onert/test/graph/operand/IndexSet.cc index 6215e0d24..6ef425a2d 100644 --- a/runtime/onert/test/graph/operand/IndexSet.cc +++ b/runtime/onert/test/graph/operand/IndexSet.cc @@ -21,7 +21,7 @@ using onert::ir::OperandIndex; using onert::ir::OperandIndexSequence; -TEST(graph_OperandIndexSequence, append) +TEST(graph_OperandIndexSequence, neg_append) { OperandIndexSequence iset{0, 2, 4, 8}; @@ -42,7 +42,7 @@ TEST(graph_OperandIndexSequence, append) ASSERT_FALSE(iset.contains(OperandIndex{11})); } -TEST(graph_OperandIndexSequence, replace) +TEST(graph_OperandIndexSequence, neg_replace) { OperandIndexSequence iset{0, 1, 2, 3}; diff --git a/runtime/onert/test/graph/operand/LayoutSet.cc b/runtime/onert/test/graph/operand/LayoutSet.cc index e35bddd8b..ef965a41e 100644 --- a/runtime/onert/test/graph/operand/LayoutSet.cc +++ b/runtime/onert/test/graph/operand/LayoutSet.cc @@ -21,7 +21,22 @@ using onert::ir::Layout; using onert::ir::LayoutSet; -TEST(graph_operand_LayoutSet, layout_set_operators) +TEST(graph_operand_LayoutSet, neg_add_remove) +{ + LayoutSet set{Layout::NCHW}; + set.remove(Layout::NHWC); + ASSERT_EQ(set.size(), 1); + set.add(Layout::NHWC); + ASSERT_EQ(set.size(), 2); + set.remove(Layout::NHWC); + ASSERT_EQ(set.size(), 1); + set.remove(Layout::NCHW); + ASSERT_EQ(set.size(), 0); + set.remove(Layout::NCHW); + ASSERT_EQ(set.size(), 0); +} + +TEST(graph_operand_LayoutSet, set_operators) { LayoutSet set1{Layout::NCHW}; LayoutSet set2{Layout::NHWC}; diff --git a/runtime/onert/test/graph/operand/Set.cc b/runtime/onert/test/graph/operand/Set.cc index 0d35b5581..ffee417b8 100644 --- a/runtime/onert/test/graph/operand/Set.cc +++ b/runtime/onert/test/graph/operand/Set.cc @@ -18,7 +18,7 @@ #include "ir/Operands.h" -TEST(graph_operand_Set, set_test) +TEST(graph_operand_Set, neg_set_test) { onert::ir::Operands set; diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/graph/operand/UseDef.cc index cd2cdb739..a8686eb18 100644 --- a/runtime/onert/test/graph/operand/UseDef.cc +++ b/runtime/onert/test/graph/operand/UseDef.cc @@ -31,7 +31,7 @@ using Mock = onert_test::ir::SimpleMock; } // namespace -TEST(graph_operand_usedef, usedef_test) +TEST(graph_operand_usedef, neg_usedef_test) { onert::ir::Graph graph; onert::ir::verifier::DAGChecker verifier; @@ -62,7 +62,7 @@ TEST(graph_operand_usedef, usedef_test) graph.finishBuilding(); - 
ASSERT_EQ(verifier.verify(graph), true); + ASSERT_TRUE(verifier.verify(graph)); // Check def ASSERT_EQ(graph.operands().at(operand_index1).getDef(), mocknode_index1); diff --git a/runtime/onert/test/graph/operation/SetIO.cc b/runtime/onert/test/graph/operation/SetIO.cc index 378c5b4b9..22068ff58 100644 --- a/runtime/onert/test/graph/operation/SetIO.cc +++ b/runtime/onert/test/graph/operation/SetIO.cc @@ -62,7 +62,7 @@ TEST(graph_operation_setIO, operation_setIO_conv) ASSERT_EQ(conv->getInputs().at(Index{0}).value(), 8); } -TEST(graph_operation_setIO, operation_setIO_concat) +TEST(graph_operation_setIO, neg_operation_setIO_concat) { onert::ir::Graph graph; diff --git a/runtime/onert/test/graph/verifier/Verifier.cc b/runtime/onert/test/graph/verifier/Verifier.cc index f8c7557e3..3bce2746c 100644 --- a/runtime/onert/test/graph/verifier/Verifier.cc +++ b/runtime/onert/test/graph/verifier/Verifier.cc @@ -45,5 +45,54 @@ TEST(Verifier, dag_checker) onert::ir::verifier::DAGChecker verifier; - ASSERT_EQ(verifier.verify(graph), true); + ASSERT_TRUE(verifier.verify(graph)); +} + +TEST(Verifier, neg_edge_consistency_checker_1) +{ + onert::ir::Graph graph; + + onert::ir::Shape shape{3}; + onert::ir::TypeInfo type{onert::ir::DataType::INT32}; + + auto operand1 = graph.addOperand(shape, type); + auto operand2 = graph.addOperand(shape, type); + + graph.addInput(operand1); + graph.addOutput(operand2); + + auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2}); + auto op_ind = graph.addOperation(std::move(mock_op)); + + graph.finishBuilding(); + + graph.operands().at(operand1).removeUse(op_ind); // Manipulate the operand alone + + onert::ir::verifier::EdgeConsistencyChecker verifier; + ASSERT_FALSE(verifier.verify(graph)); +} + +TEST(Verifier, neg_edge_consistency_checker_2) +{ + onert::ir::Graph graph; + + onert::ir::Shape shape{3}; + onert::ir::TypeInfo type{onert::ir::DataType::INT32}; + + auto operand1 = graph.addOperand(shape, type); + auto operand2 = graph.addOperand(shape, type); + + graph.addInput(operand1); + graph.addOutput(operand2); + + auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2}); + auto mock_op_ptr = mock_op.get(); + auto op_ind = graph.addOperation(std::move(mock_op)); + + graph.finishBuilding(); + + mock_op_ptr->setInputs({operand2}); // Manipulate the operation alone + + onert::ir::verifier::EdgeConsistencyChecker verifier; + ASSERT_FALSE(verifier.verify(graph)); } diff --git a/runtime/onert/test/util/ShapeInference.cc b/runtime/onert/test/util/ShapeInference.cc index aab33fab5..a5f0af5ee 100644 --- a/runtime/onert/test/util/ShapeInference.cc +++ b/runtime/onert/test/util/ShapeInference.cc @@ -34,7 +34,7 @@ TEST(ShapeInference, Elementwise) ASSERT_EQ(infered_out_shape.dim(3), 3); } -TEST(ShapeInference, IncorrectElementwise) +TEST(ShapeInference, neg_Elementwise) { Shape lhs_shape{1, 299, 299, 3}; Shape rhs_shape{5, 3}; @@ -123,6 +123,18 @@ TEST(ShapeInference, Pool2DNodeExplicit) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20); } +TEST(ShapeInference, neg_Pool2DNode_InvalidStride) +{ + Shape in_shape{10, 6, 12, 20}; + Stride stride{0, 7}; + Padding padding{PaddingType::SAME}; + + operation::Pool2D::Param avg_pool_param{ + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + ASSERT_THROW(onert::shape_inference::inferPoolShape(in_shape, avg_pool_param), + std::runtime_error); +} + TEST(ShapeInference, Conv2D) { Shape in_shape{10, 6, 12, 20}; @@ -159,6 +171,17 @@ TEST(ShapeInference, Conv2D) 
ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 30); } +TEST(ShapeInference, neg_Conv2D_InvalidStride) +{ + Shape in_shape{10, 6, 12, 20}; + Shape ker_shape{30, 3, 6, 20}; + + operation::Conv2D::Param param{Stride{0, 0}, Padding{PaddingType::VALID}, Activation::NONE, + Dilation{1, 1}}; + ASSERT_THROW(onert::shape_inference::inferConv2DShape(in_shape, ker_shape, param), + std::runtime_error); +} + TEST(ShapeInference, DepthwiseConv2D) { Shape in_shape{10, 6, 12, 20}; @@ -195,6 +218,17 @@ TEST(ShapeInference, DepthwiseConv2D) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 60); } +TEST(ShapeInference, neg_DepthwiseConv2D_InvalidSride) +{ + Shape in_shape{10, 6, 12, 20}; + Shape ker_shape{1, 3, 6, 60}; + + operation::DepthwiseConv2D::Param param{Stride{3, 0}, Padding{PaddingType::VALID}, 3, + Activation::NONE}; + ASSERT_THROW(onert::shape_inference::inferDepthwiseConv2DShape(in_shape, ker_shape, param), + std::runtime_error); +} + TEST(ShapeInference, Concat) { { @@ -328,7 +362,8 @@ TEST(ShapeInference, Transpose) // pre-conditions ASSERT_EQ(in_shape.rank(), perm.size()); ASSERT_EQ(expected.rank(), perm.size()); - auto inferred_out_shape = onert::shape_inference::inferTransposeShape(in_shape, perm); + auto inferred_out_shape = + onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()); // post-conditions ASSERT_EQ(inferred_out_shape.rank(), perm.size()); for (int32_t dim = 0; dim < expected.rank(); dim++) @@ -369,12 +404,141 @@ TEST(ShapeInference, neg_Transpose) { std::vector<int> perm = {2, 0, 1, 0}; // int32_t rank = 3; - ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error); + ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()), + std::runtime_error); } // Invalid parameter value { std::vector<int> perm = {2, 0, 3}; // int32_t rank = 3; - ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error); + ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()), + std::runtime_error); + } +} + +TEST(ShapeInference, Gather) +{ + auto check = [&](Shape &input, Shape &indices, Shape &expected, int32_t axis) { + int rank = input.rank(); + auto actual = onert::shape_inference::inferGatherShape(input, indices, axis, rank); + + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + // check for 2-D, 3-D, axis 0 + { + Shape input{3, 4}; + Shape indices{1, 1, 2}; + int32_t axis = 0; + Shape expected{1, 1, 2, 4}; + check(input, indices, expected, axis); + } + + // check for 2-D, 3-D, axis 1 + { + Shape input{3, 4}; + Shape indices{1, 2, 1}; + int32_t axis = 1; + Shape expected{3, 1, 2, 1}; + check(input, indices, expected, axis); + } + + // check for 3-D, 2-D, axis 0 + { + Shape input{2, 3, 4}; + Shape indices{1, 2}; + int32_t axis = 0; + Shape expected{1, 2, 3, 4}; + check(input, indices, expected, axis); + } + + // check for 3-D, 2-D, axis 2 + { + Shape input{2, 3, 4}; + Shape indices{2, 1}; + int32_t axis = 2; + Shape expected{2, 3, 2, 1}; + check(input, indices, expected, axis); + } + + // check for 4D, axis 0 + { + Shape input{1, 2, 3, 4}; + Shape indices{2}; + int32_t axis = 0; + Shape expected{2, 2, 3, 4}; + check(input, indices, expected, axis); + } +} + +TEST(ShapeInference, BCQFullyConnected) +{ + auto check = [&](Shape &in_shape, Shape &cluster_shape, std::vector<int> cluster, + Shape &expected) { + auto 
actual = onert::shape_inference::inferBCQFullyConnectedShape(in_shape, cluster_shape, + cluster.data()); + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + { + Shape in_shape{10, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + + Shape expected{30, 1}; + check(in_shape, cluster_shape, cluster, expected); + } + + { + Shape in_shape{1, 1}; + Shape cluster_shape{1, 2}; + std::vector<int> cluster = {3, 50}; + + Shape expected{50, 1}; + check(in_shape, cluster_shape, cluster, expected); + } +} + +TEST(ShapeInference, BCQGather) +{ + auto check = [&](Shape &indices_shape, Shape &cluster_shape, std::vector<int> cluster, + uint32_t hidden_size, uint32_t axis, int rank, Shape &expected) { + operation::BCQGather::Param param{hidden_size, axis}; + auto actual = onert::shape_inference::inferBCQGatherShape(indices_shape, cluster_shape, + cluster.data(), rank, param); + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + { + Shape indices_shape{5, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + uint32_t hidden_size = 10; + uint32_t axis = 0; + int rank = 2; + + Shape expected{5, 1, 10}; + check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected); + } + + { + Shape indices_shape{5, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + uint32_t hidden_size = 10; + uint32_t axis = 1; + int rank = 2; + + Shape expected{30, 5, 1}; + check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected); } } |
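The loader change running through the circle_loader.cc and tflite_loader.cc hunks above replaces the CRTP-style BaseLoader<LoaderDomain, SpecificLoader> with a plain BaseLoader<LoaderDomain> whose loadSubgraph is a private virtual override, so loadModel no longer needs the static_cast to the derived loader. Below is a minimal, self-contained C++ sketch of that pattern only; Graph, SubGraphHandle, CircleDomain, and the member names other than loadSubgraph/loadModel are stand-ins for illustration, not the actual onert declarations.

#include <memory>
#include <vector>

struct Graph {};          // stand-in for ir::Graph
struct SubGraphHandle {}; // stand-in for the flatbuffer SubGraph pointer

template <typename LoaderDomain> class BaseLoader
{
public:
  virtual ~BaseLoader() = default;

  void loadModel()
  {
    // Dispatch through the virtual; this replaces the previous
    // static_cast<SpecificLoader *>(this)->loadSubgraph(...) call.
    for (const auto &handle : _subgraph_handles)
      _subgraphs.push_back(loadSubgraph(handle));
  }

private:
  virtual std::unique_ptr<Graph> loadSubgraph(const SubGraphHandle &handle) = 0;

protected:
  std::vector<SubGraphHandle> _subgraph_handles;
  std::vector<std::unique_ptr<Graph>> _subgraphs;
};

struct CircleDomain {}; // stand-in for the circle LoaderDomain

class CircleLoader final : public BaseLoader<CircleDomain>
{
private:
  // Private override, mirroring the "private:" section added in circle_loader.cc.
  std::unique_ptr<Graph> loadSubgraph(const SubGraphHandle &) override
  {
    return std::make_unique<Graph>(); // the real loader fills tensors and operations here
  }
};

int main()
{
  CircleLoader loader;
  loader.loadModel(); // no subgraph handles registered in this sketch, so this is a no-op
  return 0;
}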
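The execution.cc and ANeuralNetworksExecution.cc hunks above add handling for omitted optional inputs (buffer == nullptr, length == 0, type == nullptr): IsOptionalInput reports an operand as optional when every dimension registered for it in the model is zero, and setOptionalInput then rebuilds an operand type from that model shape before forwarding to setInput. The following is a small stand-alone sketch of the dimension check only, with Shape as a stand-in for onert::ir::Shape.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for onert::ir::Shape, just enough for the check below.
struct Shape
{
  std::vector<int32_t> dims;
  int32_t rank() const { return static_cast<int32_t>(dims.size()); }
  int32_t dim(int32_t i) const { return dims[i]; }
};

// Mirrors ANeuralNetworksExecution::IsOptionalInput: an operand counts as an
// omitted optional input only if every one of its dimensions is 0.
bool isOptionalInput(const Shape &operand_shape)
{
  for (int32_t i = 0; i < operand_shape.rank(); ++i)
  {
    if (operand_shape.dim(i) != 0)
      return false;
  }
  return true;
}

int main()
{
  assert(isOptionalInput(Shape{{0}}));        // e.g. an omitted 1-D operand such as Transpose's permutation
  assert(!isOptionalInput(Shape{{1, 3, 3}})); // a regular, fully specified operand
  return 0;
}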