22 files changed, 742 insertions, 303 deletions
diff --git a/.gitignore b/.gitignore
index 0eaad0dc7..e7a60c0b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,4 @@ GRTAGS
 /externals/neon_2_sse
 /externals/tensorflow
 /externals/acl
+/externals/absl
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6238ce073..85a81dd94 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,7 @@ link_directories(${CMAKE_INSTALL_PREFIX}/lib)
 
 # Download configuration
 option(DOWNLOAD_TENSORFLOW "Download Tensorflow source" ON)
+option(DOWNLOAD_ABSL "Download Absl source" ON)
 option(DOWNLOAD_EIGEN "Download Eigen source" ON)
 option(DOWNLOAD_FARMHASH "Download farmhash source" ON)
 option(DOWNLOAD_GEMMLOWP "Download GEMM low precesion library source" ON)
@@ -49,6 +50,11 @@ option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" ON)
 option(BUILD_GTEST "Download and build Google Test" ON)
 nnfw_find_package(GTest QUIET)
 
+# NOTE Workaround to avoid build failures caused by a tensorflow (or acl) package version mismatch on OBS builds
+if(OBS_BUILD)
+  add_definitions(-DOBS_BUILD)
+endif(OBS_BUILD)
+
 # TODO For now Android build is being enabled incrementally so not all subdirectories are added yet.
 #      However we are going to have the same subdirectories with other OS eventually.
 if("${TARGET_OS}" STREQUAL "android")
diff --git a/cmake/option/option_armv7l-linux.cmake b/cmake/option/option_armv7l-linux.cmake
index 42988bc9d..b295b4a82 100644
--- a/cmake/option/option_armv7l-linux.cmake
+++ b/cmake/option/option_armv7l-linux.cmake
@@ -18,6 +18,7 @@ set(FLAGS_COMMON ${FLAGS_COMMON}
     "-mfpu=neon-vfpv4"
     "-funsafe-math-optimizations"
     "-ftree-vectorize"
+    "-fPIC"
     )
 
 # remove warning from arm cl
diff --git a/cmake/packages/AbslSourceConfig.cmake b/cmake/packages/AbslSourceConfig.cmake
new file mode 100644
index 000000000..9075b7397
--- /dev/null
+++ b/cmake/packages/AbslSourceConfig.cmake
@@ -0,0 +1,19 @@
+# Downloads the Abseil sources unless DOWNLOAD_ABSL is off, then reports the
+# outcome to the caller through AbslSource_FOUND and AbslSource_DIR.
+function(_AbslSource_import)
+  if(DOWNLOAD_ABSL)
+    nnfw_include(ExternalSourceTools)
+    nnfw_include(OptionTools)
+
+    # NOTE The following URL comes from TensorFlow 1.12
+    envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
+    set(ABSL_URL ${EXTERNAL_DOWNLOAD_SERVER}/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz)
+    ExternalSource_Download("absl" ${ABSL_URL})
+    set(AbslSource_DIR ${absl_SOURCE_DIR} PARENT_SCOPE)
+    set(AbslSource_FOUND TRUE PARENT_SCOPE)
+  else()
+    set(AbslSource_FOUND FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+_AbslSource_import()
diff --git a/cmake/packages/EigenSourceConfig.cmake b/cmake/packages/EigenSourceConfig.cmake
index aac30dff5..dd94e069e 100644
--- a/cmake/packages/EigenSourceConfig.cmake
+++ b/cmake/packages/EigenSourceConfig.cmake
@@ -7,7 +7,7 @@ function(_EigenSource_import)
   nnfw_include(ExternalSourceTools)
   nnfw_include(OptionTools)
 
-  # NOTE The following URL comes from TensorFlow 1.9
+  # NOTE TensorFlow 1.12 downloads Eigen from the following URL
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://bitbucket.org")
   set(EIGEN_URL ${EXTERNAL_DOWNLOAD_SERVER}/eigen/eigen/get/fd6845384b86.tar.gz)
   ExternalSource_Download("eigen" ${EIGEN_URL})
diff --git a/cmake/packages/FarmhashSourceConfig.cmake b/cmake/packages/FarmhashSourceConfig.cmake
index 29bc7f213..802367968 100644
--- a/cmake/packages/FarmhashSourceConfig.cmake
+++ b/cmake/packages/FarmhashSourceConfig.cmake
@@ -7,7 +7,7 @@ function(_FarmhashSource_import)
   nnfw_include(ExternalSourceTools)
   nnfw_include(OptionTools)
 
-  # NOTE TensorFlow 1.9 downloads farmhash from the following URL
+  # NOTE TensorFlow 1.12 downloads farmhash from the following URL
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
   set(FARMHASH_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz)
   ExternalSource_Download("farmhash" ${FARMHASH_URL})
diff --git a/cmake/packages/FlatBuffersConfig.cmake b/cmake/packages/FlatBuffersConfig.cmake
index fab08fe39..064673158 100644
--- a/cmake/packages/FlatBuffersConfig.cmake
+++ b/cmake/packages/FlatBuffersConfig.cmake
@@ -15,18 +15,22 @@ function(_FlatBuffers_import)
 
   # From FlatBuffers's CMakeLists.txt
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_cpp.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_dart.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_fbs.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_general.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_go.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_grpc.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_js.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_json_schema.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_lobster.cpp")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_lua.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_php.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_python.cpp")
-  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_fbs.cpp")
-  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_grpc.cpp")
-  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_json_schema.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/flatc.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/flatc_main.cpp")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/cpp_generator.cc")
   list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/go_generator.cc")
+  list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/java_generator.cc")
 
   if(NOT TARGET flatbuffers)
     add_library(flatbuffers ${FlatBuffers_Library_SRCS})
diff --git a/cmake/packages/FlatBuffersSourceConfig.cmake b/cmake/packages/FlatBuffersSourceConfig.cmake
index f062c2608..5f142bff7 100644
--- a/cmake/packages/FlatBuffersSourceConfig.cmake
+++ b/cmake/packages/FlatBuffersSourceConfig.cmake
@@ -7,9 +7,9 @@ function(_FlatBuffersSource_import)
   nnfw_include(ExternalSourceTools)
   nnfw_include(OptionTools)
 
-  # NOTE TensorFlow 1.9 downloads FlatBuffers from the following URL
+  # NOTE TensorFlow 1.12 downloads FlatBuffers from the following URL
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(FLATBUFFERS_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz)
+  set(FLATBUFFERS_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz)
   ExternalSource_Download("flatbuffers" ${FLATBUFFERS_URL})
 
   set(FlatBuffersSource_DIR ${flatbuffers_SOURCE_DIR} PARENT_SCOPE)
diff --git a/cmake/packages/GEMMLowpSourceConfig.cmake b/cmake/packages/GEMMLowpSourceConfig.cmake
index f5e73355e..613ff29b5 100644
--- a/cmake/packages/GEMMLowpSourceConfig.cmake
+++ b/cmake/packages/GEMMLowpSourceConfig.cmake
@@ -7,7 +7,7 @@ function(_GEMMLowpSource_import)
   nnfw_include(ExternalSourceTools)
   nnfw_include(OptionTools)
 
-  # NOTE TensorFlow 1.9 uses the following URL
+  # NOTE TensorFlow 1.12 uses the following URL
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
   set(GEMMLOWP_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.tar.gz)
   ExternalSource_Download("gemmlowp" ${GEMMLOWP_URL})
diff --git a/cmake/packages/NEON2SSESourceConfig.cmake b/cmake/packages/NEON2SSESourceConfig.cmake
index 6258b536d..b656f5700 100644
--- a/cmake/packages/NEON2SSESourceConfig.cmake
+++ b/cmake/packages/NEON2SSESourceConfig.cmake
@@ -7,7 +7,7 @@ function(_NEON2SSESource_import)
   nnfw_include(ExternalSourceTools)
   nnfw_include(OptionTools)
 
-  # NOTE TensorFlow 1.9 downloads NEON2SSE from the following URL
+  # NOTE TensorFlow 1.12 downloads NEON2SSE from the following URL
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
   set(NEON2SSE_URL ${EXTERNAL_DOWNLOAD_SERVER}/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz)
   ExternalSource_Download("neon_2_sse" ${NEON2SSE_URL})
diff --git a/cmake/packages/TensorFlowSourceConfig.cmake b/cmake/packages/TensorFlowSourceConfig.cmake
index ee88f9335..5828334c7 100644
--- a/cmake/packages/TensorFlowSourceConfig.cmake
+++ b/cmake/packages/TensorFlowSourceConfig.cmake
@@ -8,7 +8,7 @@ function(_TensorFlowSource_import)
   nnfw_include(OptionTools)
 
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  set(TENSORFLOW_URL ${EXTERNAL_DOWNLOAD_SERVER}/tensorflow/tensorflow/archive/v1.9.0.tar.gz)
+  set(TENSORFLOW_URL ${EXTERNAL_DOWNLOAD_SERVER}/tensorflow/tensorflow/archive/v1.12.0.tar.gz)
   ExternalSource_Download("tensorflow" ${TENSORFLOW_URL})
 
   set(TensorFlowSource_DIR ${tensorflow_SOURCE_DIR} PARENT_SCOPE)
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 5bfc4ad4e..c6f5b5b8b 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -2,6 +2,7 @@ set(TENSORFLOW_BASE ${CMAKE_CURRENT_SOURCE_DIR}/tensorflow)
 set(TENSORFLOW_LITE_BASE ${TENSORFLOW_BASE}/tensorflow/contrib/lite)
 
 # Required source & package
+nnfw_find_package(AbslSource REQUIRED)
 nnfw_find_package(Eigen REQUIRED)
 nnfw_find_package(FarmhashSource REQUIRED)
 nnfw_find_package(FlatBuffersSource REQUIRED)
@@ -19,10 +20,20 @@ file(GLOB_RECURSE TFLITE_KERNEL_SRCS "${TENSORFLOW_LITE_BASE}/kernels/*.cc")
 file(GLOB_RECURSE TFLITE_KERNEL_TESTS "${TENSORFLOW_LITE_BASE}/kernels/*test*.cc")
 list(REMOVE_ITEM TFLITE_KERNEL_SRCS ${TFLITE_KERNEL_TESTS})
 
+file(GLOB TFLITE_LIB_SRCS "${TENSORFLOW_LITE_BASE}/c/*.c" "${TENSORFLOW_LITE_BASE}/c/*.cc")
+file(GLOB TFLITE_LIB_TESTS "${TENSORFLOW_LITE_BASE}/c/*test*.cc")
+list(REMOVE_ITEM TFLITE_LIB_SRCS ${TFLITE_LIB_TESTS})
+
+file(GLOB TFLITE_API_SRCS "${TENSORFLOW_LITE_BASE}/core/api/*.c" "${TENSORFLOW_LITE_BASE}/core/api/*.cc")
+file(GLOB TFLITE_API_TESTS "${TENSORFLOW_LITE_BASE}/core/api/*test*.cc")
+list(REMOVE_ITEM TFLITE_API_SRCS ${TFLITE_API_TESTS})
+
 # We will use our own BuiltinOpResolver
 list(REMOVE_ITEM TFLITE_KERNEL_SRCS "${TENSORFLOW_LITE_BASE}/kernels/register.cc")
 list(APPEND TFLITE_SRCS ${TFLITE_CORE_SRCS})
 list(APPEND TFLITE_SRCS ${TFLITE_KERNEL_SRCS})
+list(APPEND TFLITE_SRCS ${TFLITE_LIB_SRCS})
+list(APPEND TFLITE_SRCS ${TFLITE_API_SRCS})
 
 list(APPEND TFLITE_SRCS "${TFLITE_DEPEND_DIR}/farmhash/src/farmhash.cc")
 
@@ -36,6 +47,7 @@ if(BUILD_TFLITE_BENCHMARK_MODEL)
 endif()
 
 list(APPEND TFLITE_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/tensorflow")
+list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/absl")
 list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/gemmlowp")
 list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/farmhash/src")
 list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/flatbuffers/include")
diff --git a/include/NeuralNetworksShim.h b/include/NeuralNetworksShim.h
index a7bd745fb..b310a44cd 100644
--- a/include/NeuralNetworksShim.h
+++ b/include/NeuralNetworksShim.h
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// NOTE This header is derived from the following file (in TensorFlow)
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
 //       'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h'
 #ifndef __NEURAL_NETWORKS_SHIM__
 #define __NEURAL_NETWORKS_SHIM__
@@ -68,6 +68,9 @@ typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
     ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
     uint32_t outputCount, const uint32_t* outputs);
 
+typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
+    ANeuralNetworksModel* model, bool allow);
+
 typedef int (*ANeuralNetworksExecution_create_fn)(
     ANeuralNetworksCompilation* compilation,
     ANeuralNetworksExecution** execution);
@@ -360,6 +363,34 @@ inline int ANeuralNetworksModel_identifyInputsAndOutputs(
 }
 
 /**
+ * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be
+ * calculated with range and/or precision as low as that of the IEEE 754 16-bit
+ * floating-point format. By default, {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+ * must be calculated using at least the range and precision of the IEEE 754
+ * 32-bit floating-point format.
+ *
+ * @param model The model to be modified.
+ * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
+ *              calculated with range and/or precision as low as that of the
+ *              IEEE 754 16-bit floating point format. 'false' indicates
+ *              {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using
+ *              at least the range and precision of the IEEE 754 32-bit floating
+ *              point format.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * Available since API level 28.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ */
+inline int ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+    ANeuralNetworksModel* model, bool allow) {
+  LOAD_FUNCTION(ANeuralNetworksModel_relaxComputationFloat32toFloat16);
+  EXECUTE_FUNCTION_RETURN(model, allow);
+}
+
+/**
  * Create a {@link ANeuralNetworksCompilation} to compile the given model.
  * This only creates the object. Compilation is only performed once
  * {@link ANeuralNetworksCompilation_start} is invoked.
diff --git a/include/support/tflite/nnapi_delegate.h b/include/support/tflite/nnapi_delegate.h
index a5da8ac39..b396d77f2 100644
--- a/include/support/tflite/nnapi_delegate.h
+++ b/include/support/tflite/nnapi_delegate.h
@@ -17,18 +17,24 @@ limitations under the License.
 // NOTE To minimize diff with upstream tensorflow, disable clang-format
 // clang-format off
 
-// NOTE This header is derived from the following file (in TensorFlow)
+// NOTE This header is derived from the following file (in TensorFlow v1.12)
 //        'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.h'
 #ifndef __NNFW_SUPPORT_TFLITE_NNAPI_DELEGATE_H__
 #define __NNFW_SUPPORT_TFLITE_NNAPI_DELEGATE_H__
 
 #include "tensorflow/contrib/lite/allocation.h"
+#ifdef OBS_BUILD
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
+#else
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#endif
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "NeuralNetworksShim.h"
 
 class ANeuralNetworksModel;
+class ANeuralNetworksMemory;
 class ANeuralNetworksCompilation;
 
 namespace nnfw {
@@ -62,11 +68,16 @@ class NNAPIDelegate {
   // Run
   TfLiteStatus Invoke(::tflite::Interpreter* interpreter);
 
+  // Whether the current platform supports NNAPI delegation.
+  static bool IsSupported();
+
  private:
   // The NN API model handle
   ANeuralNetworksModel* nn_model_ = nullptr;
   // The NN API compilation handle
   ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
+  // Model status
+  TfLiteStatus model_status_ = kTfLiteOk;
 
   // List of state tensors for LSTM, RNN, SVDF.
   // NN API does not allow ops to maintain states across multiple
diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp
index 987cd77c5..57322f531 100644
--- a/libs/support/tflite/src/nnapi_delegate.cpp
+++ b/libs/support/tflite/src/nnapi_delegate.cpp
@@ -17,47 +17,85 @@ limitations under the License.
 // NOTE To minimize diff with upstream tensorflow, disable clang-format
 // clang-format off
 
-// NOTE This code is derived from the following file (in TensorFlow)
+// NOTE This code is derived from the following file (in TensorFlow v1.12)
 //        'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc'
 #include "support/tflite/nnapi_delegate.h"
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#ifdef OBS_BUILD
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
+#else
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#endif
 #include "tensorflow/contrib/lite/model.h"
 #include "NeuralNetworksShim.h"
 #include "NeuralNetworksExShim.h"
 
 #ifdef __ANDROID__
+#include <android/log.h>
 #include <sys/system_properties.h>
 #endif
 
-namespace nnfw
-{
+namespace nnfw {
 
-// TODO(aselle): FATAL leaves resources hanging.
-void FATAL(const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  vfprintf(stderr, format, args);
-  va_end(args);
+void logError(const char* format, ...) {
+  // stderr is convenient for native tests, but is not captured for apps
+  va_list args_for_stderr;
+  va_start(args_for_stderr, format);
+  vfprintf(stderr, format, args_for_stderr);
+  va_end(args_for_stderr);
+  fprintf(stderr, "\n");
   fflush(stderr);
-  exit(1);
+#ifdef __ANDROID__
+  // produce logcat output for general consumption
+  va_list args_for_log;
+  va_start(args_for_log, format);
+  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
+  va_end(args_for_log);
+#endif
 }
 
+// Wrapped in do/while(0) so `FATAL(...);` is one statement, even under an unbraced if.
+#define FATAL(...) \
+  do { logError(__VA_ARGS__); exit(1); } while (0)
+
 // TODO(aselle): Change the error model to use status codes.
-#define CHECK_TFLITE_SUCCESS(x)                       \
-  if (x != kTfLiteOk) {                               \
-    FATAL("Aborting since tflite returned failure."); \
+#define CHECK_TFLITE_SUCCESS(x)                                           \
+  if (x != kTfLiteOk) {                                                   \
+    FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \
+          __LINE__);                                                      \
   }
 
-#define CHECK_NN(x)                                   \
-  if (x != ANEURALNETWORKS_NO_ERROR) {                \
-    FATAL("Aborting since tflite returned failure."); \
+#define CHECK_NN(x)                                                     \
+  if (x != ANEURALNETWORKS_NO_ERROR) {                                  \
+    FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \
+          __LINE__);                                                    \
   }
 
+#define RETURN_ERROR_IF_TFLITE_FAILED(x)                                       \
+  if (x != kTfLiteOk) {                                                        \
+    logError(                                                                  \
+        "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \
+        __LINE__);                                                             \
+    return kTfLiteError;                                                       \
+  }
+
+#define RETURN_ERROR_IF_NN_FAILED(x)                                          \
+  if (x != ANEURALNETWORKS_NO_ERROR) {                                        \
+    logError(                                                                 \
+        "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \
+        __LINE__);                                                            \
+    return kTfLiteError;                                                      \
+  }
+
+// Tracking of NNAPI operand ids
+static const int64_t kOperandIdNotSet = -1;
+static const int64_t kOperandNotNeeded = -2;
+
 namespace {
 
 int32_t GetAndroidSdkVersion() {
@@ -80,7 +118,10 @@ int32_t GetAndroidSdkVersion() {
   return 0;
 }
 
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+int32_t GetAndroidSdkVersionCached() {
+  static int32_t androidSdkVersion = GetAndroidSdkVersion();
+  return androidSdkVersion;
+}
 
 }  // namespace
 
@@ -112,21 +153,16 @@ NNAPIDelegate::~NNAPIDelegate() {
 }
 
 // Adds the tensors of the interpreter to the NN API model.
-// Returns the number of operands added.
-uint32_t addTensorOperands(tflite::Interpreter* interpreter,
+TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
                                ANeuralNetworksModel* nn_model,
-                               const std::vector<uint32_t>& skip_list) {
+                               uint32_t* no_of_operands_added,
+                               std::vector<int64_t>* nnapi_ids) {
   uint32_t next_id = 0;
   for (size_t i = 0; i < interpreter->tensors_size(); i++) {
-    // skip temporaries tensors.
-    bool shouldSkip = false;
-    for (auto skip_idx : skip_list) {
-      if (i == skip_idx) {
-        shouldSkip = true;
-        break;
-      }
-    }
-    if (shouldSkip) continue;
+    // Skip temporaries and RNN back-edges.
+    if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
+
+    (*nnapi_ids)[i] = int64_t(next_id);
 
     int32_t nn_type = 0;
     // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
@@ -160,7 +196,28 @@ uint32_t addTensorOperands(tflite::Interpreter* interpreter,
         zeroPoint = tensor->params.zero_point;
         break;
       default:
-        FATAL("Unsupported type.");
+        logError("Unsupported tensor type %d", tensor->type);
+        return kTfLiteError;
+    }
+    if (tensor->dims->size == 0) {
+      // WORKAROUND Some models contain rank-0 tensors; map them to NNAPI scalars
+      switch (tensor->type) {
+        case kTfLiteFloat32:
+          nn_type = ANEURALNETWORKS_FLOAT32;
+          break;
+        case kTfLiteInt32:
+          nn_type = ANEURALNETWORKS_INT32;
+          break;
+        default:
+          logError("NNAPI doesn't support tensors with rank 0 (index %zu name %s)",
+                   i, tensor->name);
+          return kTfLiteError;
+      }
+    }
+    if (tensor->dims->size > 4) {
+      logError("NNAPI doesn't support tensors with rank > 4 (index %zu name %s)",
+               i, tensor->name);
+      return kTfLiteError;
     }
     // TODO(aselle): Note, many of these are intermediate results. Do I need
     // to ever specify these sizes. I am currently below doing setValue
@@ -170,36 +227,53 @@ uint32_t addTensorOperands(tflite::Interpreter* interpreter,
     ANeuralNetworksOperandType operand_type{
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+    RETURN_ERROR_IF_NN_FAILED(
+        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
     // TODO(aselle): Based on Michael's suggestion, limiting this to read
     // only memory
     if (tensor->allocation_type == kTfLiteMmapRo) {
       if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
-              static_cast<const ::tflite::Allocation*>(tensor->allocation))) {
-        CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory(
-            nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw),
-            tensor->bytes));
+              static_cast<const tflite::Allocation*>(tensor->allocation))) {
+        RETURN_ERROR_IF_NN_FAILED(
+            ANeuralNetworksModel_setOperandValueFromMemory(
+                nn_model, next_id, alloc->memory(),
+                alloc->offset(tensor->data.raw), tensor->bytes));
       } else {
-        CHECK_NN(ANeuralNetworksModel_setOperandValue(
+        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
             nn_model, next_id, tensor->data.raw, tensor->bytes));
       }
     } else if (tensor->bytes == 0) {
       // These size 0 tensors are optional tensors reserved.
-      CHECK_NN(
+      RETURN_ERROR_IF_NN_FAILED(
           ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
     }
 
     ++next_id;
   }
-  return next_id;
+  *no_of_operands_added = next_id;
+  return kTfLiteOk;
+}
+
+// Appends each id in from_ids_buf[0..from_ids_count) to `into`: kOptionalTensor
+// entries are copied as-is, all other ids are looked up in `map` first.
+void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
+                        std::vector<uint32_t>* into,
+                        const std::vector<int64_t>& map) {
+  for (size_t pos = 0; pos != from_ids_count; ++pos) {
+    const int id = from_ids_buf[pos];
+    const bool is_optional = (id == kOptionalTensor);
+    if (is_optional) into->push_back(id);
+    else into->push_back(map[id]);
+  }
+}
 
 // Adds the operations and their parameters to the NN API model.
 // 'next-id' is the operand ID of the next operand of the model.
-void AddOpsAndParams(tflite::Interpreter* interpreter,
-                     ANeuralNetworksModel* nn_model, uint32_t next_id,
-                     std::vector<int>* model_state_inputs,
-                     std::vector<int>* model_state_outputs) {
+TfLiteStatus AddOpsAndParams(
+    tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model,
+    uint32_t next_id, std::vector<int>* model_state_inputs,
+    std::vector<int>* model_state_outputs,
+    const std::vector<int64_t>& tensor_id_to_nnapi_id) {
   for (size_t i = 0; i < interpreter->nodes_size(); i++) {
     const auto* node_and_registration = interpreter->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
@@ -208,10 +282,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         static_cast<tflite::BuiltinOperator>(registration.builtin_code);
 
     // Add the parameters.
-    std::vector<uint32_t> augmented_inputs(
-        node.inputs->data, node.inputs->data + node.inputs->size);
-    std::vector<uint32_t> augmented_outputs(
-        node.outputs->data, node.outputs->data + node.outputs->size);
+    std::vector<uint32_t> augmented_inputs, augmented_outputs;
+    MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs,
+                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(node.outputs->data, node.outputs->size,
+                       &augmented_outputs, tensor_id_to_nnapi_id);
 
     auto add_scalar_int32 = [&nn_model, &augmented_inputs,
                              &next_id](int value) {
@@ -260,47 +335,68 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
           model_state_outputs->push_back(tensor_id);
           next_id++;
         };
+    auto check_and_add_activation = [&add_scalar_int32](int activation) {
+      if (activation > kTfLiteActRelu6) {
+        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+        return kTfLiteError;
+      }
+      add_scalar_int32(activation);
+      return kTfLiteOk;
+    };
 
-    auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
+    auto add_add_params = [&add_scalar_int32](void* data) {
+      auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
+      if (builtin->activation > kTfLiteActRelu6) {
+        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+        return kTfLiteError;
+      }
+      add_scalar_int32(builtin->activation);
+      return kTfLiteOk;
+    };
 
-    auto add_pooling_params = [&add_scalar_int32](void* data) {
+    auto add_pooling_params = [&add_scalar_int32,
+                               &check_and_add_activation](void* data) {
       auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
       add_scalar_int32(builtin->padding);
       add_scalar_int32(builtin->stride_width);
       add_scalar_int32(builtin->stride_height);
       add_scalar_int32(builtin->filter_width);
       add_scalar_int32(builtin->filter_height);
-      add_scalar_int32(builtin->activation);
+      return check_and_add_activation(builtin->activation);
     };
 
-    auto add_convolution_params = [&add_scalar_int32](void* data) {
+    auto add_convolution_params = [&add_scalar_int32,
+                                   &check_and_add_activation](void* data) {
       auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
       add_scalar_int32(builtin->padding);
       add_scalar_int32(builtin->stride_width);
       add_scalar_int32(builtin->stride_height);
-      add_scalar_int32(builtin->activation);
+      return check_and_add_activation(builtin->activation);
     };
 
-    auto add_depthwise_conv_params = [&add_scalar_int32](void* data) {
+    auto add_depthwise_conv_params = [&add_scalar_int32,
+                                      &check_and_add_activation](void* data) {
       auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
       add_scalar_int32(builtin->padding);
       add_scalar_int32(builtin->stride_width);
       add_scalar_int32(builtin->stride_height);
       add_scalar_int32(builtin->depth_multiplier);
-      add_scalar_int32(builtin->activation);
+      return check_and_add_activation(builtin->activation);
     };
 
-    auto add_fully_connected_params = [&add_scalar_int32](void* data) {
+    auto add_fully_connected_params = [&check_and_add_activation](void* data) {
       auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
-      add_scalar_int32(builtin->activation);
+      return check_and_add_activation(builtin->activation);
     };
 
     auto add_concatenation_params = [&add_scalar_int32](void* data) {
       auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
       add_scalar_int32(builtin->axis);
       if (builtin->activation != kTfLiteActNone) {
-        FATAL("Concatenation does not support fused activation in NNAPI");
+        logError("Concatenation does not support fused activation in NNAPI");
+        return kTfLiteError;
       }
+      return kTfLiteOk;
     };
 
     auto add_softmax_params = [&add_scalar_float32](void* data) {
@@ -324,6 +420,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     // LSTM in NNAPI requires scratch tensor as an output operand.
     auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
+      if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
       const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
       ANeuralNetworksOperandType operand_type{
@@ -336,7 +433,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     };
 
     auto add_mean_params = [&add_scalar_int32](void* data) {
+#ifdef OBS_BUILD
       auto builtin = reinterpret_cast<TfLiteMeanParams*>(data);
+#else
+      auto builtin = reinterpret_cast<TfLiteReducerParams*>(data);
+#endif
       add_scalar_int32(builtin->keep_dims);
     };
 
@@ -351,7 +452,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       add_scalar_int32(builtin->activation);
     };
 
-    auto add_squeeze_params = [&add_vector_int32](void* data) {
+    auto add_squeeze_params = [&](void* data) {
       const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data);
       // Note that we add the squeeze dimensions even if the dimensions were
       // unspecified (empty), as NNAPI requires the operand.
@@ -382,26 +483,34 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     switch (builtin) {
       case tflite::BuiltinOperator_ADD:
         nn_op_type = ANEURALNETWORKS_ADD;
-        add_add_params();
+        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
         break;
       case tflite::BuiltinOperator_MUL:
         nn_op_type = ANEURALNETWORKS_MUL;
-        add_add_params();
+        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
         break;
       case tflite::BuiltinOperator_AVERAGE_POOL_2D:
-        add_pooling_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
         break;
       case tflite::BuiltinOperator_MAX_POOL_2D:
-        add_pooling_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
         break;
       case tflite::BuiltinOperator_L2_POOL_2D:
-        add_pooling_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
         break;
-      case tflite::BuiltinOperator_CONV_2D:
-        add_convolution_params(node.builtin_data);
+      case tflite::BuiltinOperator_CONV_2D: {
+        auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data);
+        if (builtin->dilation_width_factor != 1 ||
+            builtin->dilation_height_factor != 1 || node.inputs->size != 3) {
+          logError("NNAPI does not support dilated Conv2D.");
+          return kTfLiteError;
+        }
+      }
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_convolution_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_CONV_2D;
         break;
       case tflite::BuiltinOperator_RELU:
@@ -423,11 +532,13 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_LOGISTIC;
         break;
       case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
-        add_depthwise_conv_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_depthwise_conv_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
         break;
       case tflite::BuiltinOperator_CONCATENATION:
-        add_concatenation_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_concatenation_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_CONCATENATION;
         break;
       case tflite::BuiltinOperator_SOFTMAX:
@@ -435,10 +546,15 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_SOFTMAX;
         break;
       case tflite::BuiltinOperator_FULLY_CONNECTED:
-        add_fully_connected_params(node.builtin_data);
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_fully_connected_params(node.builtin_data));
         nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
         break;
       case tflite::BuiltinOperator_RESHAPE:
+        if (node.inputs->size != 2) {
+          logError("NNAPI only supports 2-input RESHAPE");
+          return kTfLiteError;
+        }
         nn_op_type = ANEURALNETWORKS_RESHAPE;
         // add_reshape_params(node.builtin_data);
         break;
@@ -451,6 +567,10 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
         break;
       case tflite::BuiltinOperator_LSTM: {
+        if (node.inputs->size + /* no of params */ 3 != 21) {
+          logError("NNAPI only supports 21-input LSTMs");
+          return kTfLiteError;
+        }
         duplicate_state_tensor_float32(
             node.outputs->data[/*kOutputStateTensor*/ 0]);
         duplicate_state_tensor_float32(
@@ -489,10 +609,6 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         add_mean_params(node.builtin_data);
         nn_op_type = ANEURALNETWORKS_MEAN;
         break;
-      case tflite::BuiltinOperator_L2_NORMALIZATION:
-        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
-        check_l2normalization_params(node.builtin_data);
-        break;
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
         nn_op_type = ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
         add_lrn_params(node.builtin_data);
@@ -500,20 +616,54 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_DIV:
         nnapi_version = 11;  // require NNAPI 1.1
         nn_op_type = ANEURALNETWORKS_DIV;
-        add_add_params();
-        break;
-      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
-        nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
+        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+            reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation));
         break;
       case tflite::BuiltinOperator_SUB:
         nnapi_version = 11;  // require NNAPI 1.1
         nn_op_type = ANEURALNETWORKS_SUB;
-        add_add_params();
+        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+            reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation));
         break;
       case tflite::BuiltinOperator_SQUEEZE:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_SQUEEZE;
+        nnapi_version = 11;  // requires NNAPI 1.1
         add_squeeze_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SQUEEZE;
+        break;
+      case tflite::BuiltinOperator_TRANSPOSE:
+        // The permutation input tensor value dictates the output dimensions.
+        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
+        if ((node.inputs->size > 1) &&
+            (interpreter->tensor(node.inputs->data[1])->allocation_type !=
+             kTfLiteMmapRo)) {
+          logError("NNAPI does not yet support dynamic tensors.");
+          return kTfLiteError;
+        }
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_TRANSPOSE;
+        break;
+      case tflite::BuiltinOperator_L2_NORMALIZATION:
+        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
+        if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
+                ->activation != kTfLiteActNone) {
+          logError(
+              "NNAPI does not support L2Normalization with fused activations");
+          return kTfLiteError;
+        }
+        if ((node.inputs->size > 0) &&
+            (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+          logError("NNAPI only supports input rank 4 for L2Normalization");
+          return kTfLiteError;
+        }
+        break;
+      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
+        if (interpreter->tensor(node.outputs->data[0])->type !=
+            kTfLiteFloat32) {
+          logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
+                   builtin);
+          return kTfLiteError;
+        }
+        nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
         break;
       case tflite::BuiltinOperator_STRIDED_SLICE:
         add_strided_slice_params(node.builtin_data);
@@ -557,10 +707,6 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
             augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
             reinterpret_cast<uint32_t*>(node.outputs->data)));
         continue;
-      case tflite::BuiltinOperator_TRANSPOSE:
-        nn_op_type = ANEURALNETWORKS_TRANSPOSE;
-        // param is almost same as reshape
-        break;
       case tflite::BuiltinOperator_NEG:
         CHECK_NN(ANeuralNetworksModel_addOperationEx(
             nn_model, ANEURALNETWORKS_NEG_EX,
@@ -582,28 +728,71 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
       case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
+      //case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
       case tflite::BuiltinOperator_PADV2:
+      //case tflite::BuiltinOperator_RESIZE_BILINEAR:
       case tflite::BuiltinOperator_CALL:
       case tflite::BuiltinOperator_SKIP_GRAM:
+      //case tflite::BuiltinOperator_RELU_N1_TO_1:
+      //case tflite::BuiltinOperator_GATHER:
+      //case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+      //case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+      //case tflite::BuiltinOperator_TOPK_V2:
+      //case tflite::BuiltinOperator_SPLIT:
+      //case tflite::BuiltinOperator_STRIDED_SLICE:
+      //case tflite::BuiltinOperator_EXP:
       case tflite::BuiltinOperator_LOG_SOFTMAX:
+      //case tflite::BuiltinOperator_DEQUANTIZE:
       case tflite::BuiltinOperator_DELEGATE:
+      //case tflite::BuiltinOperator_CAST:
       case tflite::BuiltinOperator_PRELU:
       case tflite::BuiltinOperator_MAXIMUM:
       case tflite::BuiltinOperator_MINIMUM:
       case tflite::BuiltinOperator_ARG_MAX:
+#ifndef OBS_BUILD
+      case tflite::BuiltinOperator_ARG_MIN:
+#endif
       case tflite::BuiltinOperator_GREATER:
       case tflite::BuiltinOperator_GREATER_EQUAL:
       case tflite::BuiltinOperator_LESS:
       case tflite::BuiltinOperator_LESS_EQUAL:
+      //case tflite::BuiltinOperator_NEG:
       case tflite::BuiltinOperator_SELECT:
       case tflite::BuiltinOperator_SLICE:
       case tflite::BuiltinOperator_SIN:
+      //case tflite::BuiltinOperator_LOG:
       case tflite::BuiltinOperator_TRANSPOSE_CONV:
+#ifndef OBS_BUILD
+      case tflite::BuiltinOperator_TILE:
+      case tflite::BuiltinOperator_EXPAND_DIMS:
       case tflite::BuiltinOperator_SPARSE_TO_DENSE:
-        FATAL("Op code %d is currently not delegated to NNAPI", builtin);
-        nn_op_type = -1;  // set to invalid
+      case tflite::BuiltinOperator_EQUAL:
+      case tflite::BuiltinOperator_NOT_EQUAL:
+      case tflite::BuiltinOperator_SUM:
+      case tflite::BuiltinOperator_REDUCE_MAX:
+      case tflite::BuiltinOperator_REDUCE_MIN:
+      case tflite::BuiltinOperator_REDUCE_PROD:
+      case tflite::BuiltinOperator_SQRT:
+      case tflite::BuiltinOperator_RSQRT:
+      case tflite::BuiltinOperator_SHAPE:
+      case tflite::BuiltinOperator_POW:
+      case tflite::BuiltinOperator_FAKE_QUANT:
+      case tflite::BuiltinOperator_PACK:
+      case tflite::BuiltinOperator_LOGICAL_OR:
+      case tflite::BuiltinOperator_ONE_HOT:
+      case tflite::BuiltinOperator_LOGICAL_AND:
+      case tflite::BuiltinOperator_LOGICAL_NOT:
+      case tflite::BuiltinOperator_UNPACK:
+      case tflite::BuiltinOperator_FLOOR_DIV:
+      case tflite::BuiltinOperator_REDUCE_ANY:
+      case tflite::BuiltinOperator_SQUARE:
+      case tflite::BuiltinOperator_ZEROS_LIKE:
+      case tflite::BuiltinOperator_FILL:
+#endif
+        logError("Op code %d is currently not delegated to NNAPI", builtin);
+        return kTfLiteError;
         break;
-      case tflite::BuiltinOperator_CUSTOM:
+      case tflite::BuiltinOperator_CUSTOM: {
         std::string custom_name(registration.custom_name);
         if (custom_name.compare("TensorFlowMax") == 0) {
           CHECK_NN(ANeuralNetworksModel_addOperationEx(
@@ -650,64 +839,99 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
               reinterpret_cast<uint32_t*>(node.outputs->data)));
           continue;
         }
-
-        FATAL("Custom operations are not supported when using NNAPI.");
-        nn_op_type = -1;  // set to invalid
+        logError("Custom operations are not supported when using NNAPI.");
+        return kTfLiteError;
+        break;
+      }
+#ifdef OBS_BUILD
+      default:
+        logError("Op code %d is currently not delegated to NNAPI", builtin);
+        return kTfLiteError;
         break;
+#endif
     }
 
-    //if (nnapi_version == 11 && kAndroidSdkVersion < 28) {
+    //if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
     //  FATAL("Op %d needs NNAPI1.1", builtin);
     //}
 
     // Add the operation.
-    CHECK_NN(ANeuralNetworksModel_addOperation(
+    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
         nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
         augmented_inputs.data(),
         static_cast<uint32_t>(augmented_outputs.size()),
         reinterpret_cast<uint32_t*>(augmented_outputs.data())));
   }
+  return kTfLiteOk;
 }
 
 TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) {
-  // TODO(aselle): This is not correct. need to handle resize invalidation.
-  if (nn_model_ && nn_compiled_model_) return kTfLiteOk;
+  if (nn_model_ && nn_compiled_model_) return model_status_;
 
+  // TODO(aselle): This is not correct. need to handle resize invalidation.
   if (!nn_model_) {
     CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
 
-    // Find all the temporary tensors and put them in a skip_list.
-    std::vector<uint32_t> skip_list;
+    // Find which tensors should be added to NNAPI. TFLite has temporaries
+    // and RNN back-edges which are not valid for NNAPI. We look through all
+    // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
+    // kOperandIdNotSet. addTensorOperands will replace those with the
+    // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
+    std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(),
+                                               kOperandNotNeeded);
+    auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
+                                                       size_t count) {
+      for (int j = 0; j < count; j++) {
+        auto tensor_id = buf[j];
+        if (tensor_id != kOptionalTensor) {
+          tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet;
+        }
+      }
+    };
     for (size_t i = 0; i < interpreter->nodes_size(); i++) {
       const auto* node_and_registration = interpreter->node_and_registration(i);
       const TfLiteNode& node = node_and_registration->first;
-      if (node.temporaries != nullptr) {
-        for (int j = 0; j < node.temporaries->size; j++) {
-          skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j]));
-        }
-      }
+      set_ids_to_not_set(node.inputs->data, node.inputs->size);
+      set_ids_to_not_set(node.outputs->data, node.outputs->size);
     }
-
-    uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list);
-    AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
-                    &model_states_outputs_);
-
-    std::vector<int> augmented_inputs = interpreter->inputs();
-    std::vector<int> augmented_outputs = interpreter->outputs();
-
-    // All state tensors input/output need to be treated as model input/output.
+    set_ids_to_not_set(interpreter->inputs().data(),
+                       interpreter->inputs().size());
+    set_ids_to_not_set(interpreter->outputs().data(),
+                       interpreter->outputs().size());
+
+    uint32_t next_id = 0;
+    RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
+        interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id));
+    RETURN_ERROR_IF_TFLITE_FAILED(
+        AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
+                        &model_states_outputs_, tensor_id_to_nnapi_id));
+
+    std::vector<uint32_t> augmented_inputs;
+    MapAndAddTensorIds(interpreter->inputs().data(),
+                       interpreter->inputs().size(), &augmented_inputs,
+                       tensor_id_to_nnapi_id);
     augmented_inputs.insert(augmented_inputs.end(),
                             model_states_inputs_.begin(),
                             model_states_inputs_.end());
-    augmented_outputs.insert(augmented_outputs.end(),
-                             model_states_outputs_.begin(),
-                             model_states_outputs_.end());
+    std::vector<uint32_t> augmented_outputs;
+    MapAndAddTensorIds(interpreter->outputs().data(),
+                       interpreter->outputs().size(), &augmented_outputs,
+                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(model_states_outputs_.data(),
+                       model_states_outputs_.size(), &augmented_outputs,
+                       tensor_id_to_nnapi_id);
 
     CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
         nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
         static_cast<uint32_t>(augmented_outputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
+
+    // TODO Support ANeuralNetworksModel_relaxComputationFloat32toFloat16
+    //if (GetAndroidSdkVersionCached() >= 28) {
+    //  CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+    //      nn_model_, interpreter->GetAllowFp16PrecisionForFp32()));
+    //}
     CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
   }
   if (!nn_compiled_model_) {
@@ -719,7 +943,13 @@ TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) {
 
 TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) {
   if (!nn_model_) {
-    TF_LITE_ENSURE_STATUS(BuildGraph(interpreter));
+    model_status_ = BuildGraph(interpreter);
+    if (model_status_ != kTfLiteOk) {
+      logError("Failed to build graph for NNAPI");
+    }
+  }
+  if (model_status_ != kTfLiteOk) {
+    return model_status_;
   }
 
   ANeuralNetworksExecution* execution = nullptr;
@@ -783,6 +1013,8 @@ TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) {
   return kTfLiteOk;
 }
 
+bool NNAPIDelegate::IsSupported() { return nnfw::NNAPIExists(); }
+
 } // namespace nnfw
 
 // clang-format on
diff --git a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
index 2a6a84e10..f434a6dec 100644
--- a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
@@ -27,14 +27,14 @@ namespace kernel
 namespace cpu
 {
 
-#define AVGPOOLING_PARAMETERS                               \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);     \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);      \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);  \
-                                                            \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;           \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;
+#define AVGPOOLING_PARAMETERS /* fills op_params; int16_t casts keep paddings above 127 intact */ \
+  tflite::PoolParams op_params;                           \
+  op_params.stride_height = _strideHeight;                \
+  op_params.stride_width = _strideWidth;                  \
+  op_params.filter_height = _kernelHeight;                \
+  op_params.filter_width = _kernelWidth;                  \
+  op_params.padding_values.height = (int16_t)_paddingTop; \
+  op_params.padding_values.width = (int16_t)_paddingLeft;
 
 AvgPoolLayer::AvgPoolLayer()
     : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0),
@@ -47,31 +47,31 @@ AvgPoolLayer::AvgPoolLayer()
 
 bool AvgPoolLayer::averagePoolFloat32()
 {
-
   AVGPOOLING_PARAMETERS
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::AveragePool(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth,
-      _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                       reinterpret_cast<const float *>(_inputData),
+                                       convertShapeToTFLiteShape(_outputShape),
+                                       reinterpret_cast<float *>(_outputData));
   return true;
 }
 bool AvgPoolLayer::averagePoolQuant8()
 {
-
   AVGPOOLING_PARAMETERS
   int32_t output_activation_min = 0;
   int32_t output_activation_max = 0;
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::AveragePool(_inputData, convertShapeToDims(_inputShape), _strideWidth,
-                                       _strideHeight, paddingWidth, paddingHeight, _kernelWidth,
-                                       _kernelHeight, output_activation_min, output_activation_max,
-                                       _outputData, convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                       _inputData, convertShapeToTFLiteShape(_outputShape),
+                                       _outputData);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
index 5fe5e3993..be093b437 100644
--- a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
@@ -24,6 +24,7 @@ namespace neurun
 {
 namespace kernel
 {
+
 namespace cpu
 {
 
@@ -36,13 +37,21 @@ ConcatLayer::ConcatLayer()
 
 bool ConcatLayer::concatenationFloat32()
 {
-  int num_inputs = _inputShapes.size();
-  std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs);
-  std::vector<::tflite::Dims<4>> inputDims(num_inputs);
-  for (int i = 0; i < num_inputs; i++)
+  uint32_t num_inputs = _inputShapes.size();
+
+  tflite::ConcatenationParams op_params;
+  op_params.axis = _axis;
+  op_params.inputs_count = num_inputs;
+
+  std::vector<::tflite::RuntimeShape *> inputDimsPtr;
+  std::vector<::tflite::RuntimeShape> inputDims;
+  inputDimsPtr.reserve(num_inputs);
+  inputDims.reserve(num_inputs);
+
+  for (uint32_t i = 0; i < num_inputs; i++)
   {
-    inputDims[i] = convertShapeToDims(_inputShapes[i]);
-    inputDimsPtr[i] = &inputDims[i];
+    inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i]));
+    inputDimsPtr.push_back(&inputDims[i]);
   }
 
   std::vector<const float *> inputFloatPtrs;
@@ -52,24 +61,44 @@ bool ConcatLayer::concatenationFloat32()
     inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(ptr));
   }
 
-  ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, float>(
-      getNumberOfDimensions(_outputShape) - _axis - 1, inputFloatPtrs.data(), inputDimsPtr.data(),
-      num_inputs, reinterpret_cast<float *>(_outputData), convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::Concatenation<float>(
+      op_params, inputDimsPtr.data(), inputFloatPtrs.data(),
+      convertShapeToTFLiteShape(_outputShape), reinterpret_cast<float *>(_outputData));
   return true;
 }
 bool ConcatLayer::concatenationQuant8()
 {
-  int num_inputs = _inputShapes.size();
-  std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs);
-  std::vector<::tflite::Dims<4>> inputDims(num_inputs);
-  for (int i = 0; i < num_inputs; i++)
+  uint32_t num_inputs = _inputShapes.size(); // unsigned, matching the uint32_t loop indices below
+
+  std::vector<int32_t> input_zeropoints(num_inputs);
+  std::vector<float> input_scales(num_inputs);
+  for (uint32_t i = 0; i < num_inputs; i++)
   {
-    inputDims[i] = convertShapeToDims(_inputShapes[i]);
-    inputDimsPtr[i] = &inputDims[i];
+    input_zeropoints[i] = _inputShapes[i].offset;
+    input_scales[i] = _inputShapes[i].scale;
   }
-  ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, uint8_t>(
-      getNumberOfDimensions(_outputShape) - _axis - 1, _inputDataPtrs.data(), inputDimsPtr.data(),
-      num_inputs, _outputData, convertShapeToDims(_outputShape));
+
+  tflite::ConcatenationParams op_params;
+  op_params.axis = _axis;
+  op_params.inputs_count = num_inputs;
+  op_params.input_zeropoint = input_zeropoints.data();
+  op_params.input_scale = input_scales.data();
+  op_params.output_zeropoint = _outputShape.offset;
+  op_params.output_scale = _outputShape.scale;
+
+  std::vector<::tflite::RuntimeShape *> inputDimsPtr;
+  std::vector<::tflite::RuntimeShape> inputDims;
+  inputDimsPtr.reserve(num_inputs);
+  inputDims.reserve(num_inputs);
+  for (uint32_t i = 0; i < num_inputs; i++)
+  {
+    inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i]));
+    inputDimsPtr.push_back(&inputDims[i]);
+  }
+
+  ::tflite::optimized_ops::Concatenation<uint8_t>(
+      op_params, inputDimsPtr.data(), _inputDataPtrs.data(),
+      convertShapeToTFLiteShape(_outputShape), _outputData);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
index 81e88e0f0..c694fa75f 100644
--- a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
@@ -33,55 +33,51 @@ static constexpr int kStaticBufferSize = 1605632;
 static char static_scratch_buffer[kStaticBufferSize];
 static std::mutex executionMutex;
 
-#define ANDROID_NN_CONV_PARAMETERS(Type)                                      \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);                       \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);                        \
-  uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1);                \
-  uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2);                 \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1);                   \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);                    \
-  uint32_t inDepth = getSizeOfDimension(_inputShape, 3);                      \
-                                                                              \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;                             \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;                             \
-                                                                              \
-  ::tflite::Dims<4> im2colDim;                                                \
-  im2colDim.sizes[3] = (int)getSizeOfDimension(_outputShape, 0);              \
-  im2colDim.sizes[2] = (int)getSizeOfDimension(_outputShape, 1);              \
-  im2colDim.sizes[1] = (int)getSizeOfDimension(_outputShape, 2);              \
-  im2colDim.sizes[0] = (int)inDepth * kernelHeight * kernelWidth;             \
-                                                                              \
-  im2colDim.strides[0] = 1;                                                   \
-  for (int i = 1; i < 4; i++)                                                 \
-  {                                                                           \
-    im2colDim.strides[i] = im2colDim.strides[i - 1] * im2colDim.sizes[i - 1]; \
-  }                                                                           \
-  Type *im2colData = nullptr;                                                 \
-  uint64_t im2colByteSize = sizeof(Type);                                     \
-  std::unique_ptr<Type[]> im2colGuard;                                        \
-  for (int i = 0; i < 4; i++)                                                 \
-  {                                                                           \
-    im2colByteSize *= im2colDim.sizes[i];                                     \
-  }                                                                           \
-  /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */   \
-  if (im2colByteSize >= 0x7fffffff)                                           \
-  {                                                                           \
-    std::cout << "Conv size is too large, not enough memory" << std::endl;    \
-    return false;                                                             \
-  }                                                                           \
-  if (im2colByteSize <= kStaticBufferSize)                                    \
-  {                                                                           \
-    im2colData = reinterpret_cast<Type *>(static_scratch_buffer);             \
-  }                                                                           \
-  else                                                                        \
-  {                                                                           \
-    im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)];      \
-    if (im2colData == nullptr)                                                \
-    {                                                                         \
-      std::cout << "Conv size is too large, not enough memory" << std::endl;  \
-      return false;                                                           \
-    }                                                                         \
-    im2colGuard.reset(im2colData);                                            \
+#define ANDROID_NN_CONV_PARAMETERS(Type)                                     \
+  uint32_t height = getSizeOfDimension(_inputShape, 1);                      \
+  uint32_t width = getSizeOfDimension(_inputShape, 2);                       \
+  uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1);               \
+  uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2);                \
+  uint32_t outHeight = getSizeOfDimension(_outputShape, 1);                  \
+  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);                   \
+  uint32_t inDepth = getSizeOfDimension(_inputShape, 3);                     \
+                                                                             \
+  uint32_t paddingHeight = (uint32_t)_paddingTop;                            \
+  uint32_t paddingWidth = (uint32_t)_paddingLeft;                            \
+                                                                             \
+  Shape im2colShape;                                                         \
+  im2colShape.dimensions.resize(4);                                          \
+  im2colShape.dimensions[0] = getSizeOfDimension(_outputShape, 0);           \
+  im2colShape.dimensions[1] = getSizeOfDimension(_outputShape, 1);           \
+  im2colShape.dimensions[2] = getSizeOfDimension(_outputShape, 2);           \
+  im2colShape.dimensions[3] = inDepth * kernelHeight * kernelWidth;          \
+                                                                             \
+  Type *im2colData = nullptr;                                                \
+  uint64_t im2colByteSize = sizeof(Type);                                    \
+  std::unique_ptr<Type[]> im2colGuard;                                       \
+  for (int i = 0; i < 4; i++)                                                \
+  {                                                                          \
+    im2colByteSize *= im2colShape.dimensions[i];                             \
+  }                                                                          \
+  /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */  \
+  if (im2colByteSize >= 0x7fffffff)                                          \
+  {                                                                          \
+    std::cout << "Conv size is too large, not enough memory" << std::endl;   \
+    return false;                                                            \
+  }                                                                          \
+  if (im2colByteSize <= kStaticBufferSize)                                   \
+  {                                                                          \
+    im2colData = reinterpret_cast<Type *>(static_scratch_buffer);            \
+  }                                                                          \
+  else                                                                       \
+  {                                                                          \
+    im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)];     \
+    if (im2colData == nullptr)                                               \
+    {                                                                        \
+      std::cout << "Conv size is too large, not enough memory" << std::endl; \
+      return false;                                                          \
+    }                                                                        \
+    im2colGuard.reset(im2colData);                                           \
   }
 
 ConvolutionLayer::ConvolutionLayer()
@@ -112,19 +108,32 @@ bool ConvolutionLayer::convFloat32()
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
   int32_t dilationWidthFactor = 1, dilationHeightFactor = 1;
+
+  ::tflite::ConvParams op_params;
+  op_params.padding_type = ::tflite::PaddingType::kSame;
+  op_params.padding_values.width = paddingWidth;
+  op_params.padding_values.height = paddingHeight;
+  op_params.stride_width = _strideWidth;
+  op_params.stride_height = _strideHeight;
+  op_params.dilation_width_factor = dilationWidthFactor;
+  op_params.dilation_height_factor = dilationHeightFactor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
   ::tflite::optimized_ops::Conv(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-      reinterpret_cast<const float *>(_kernelData), convertShapeToDims(_kernelShape),
-      reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape), _strideWidth,
-      _strideHeight, dilationWidthFactor, dilationHeightFactor, paddingWidth, paddingHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape), im2colDataToPass, im2colDim);
+      op_params, convertShapeToTFLiteShape(_inputShape),
+      reinterpret_cast<const float *>(_inputData), convertShapeToTFLiteShape(_kernelShape),
+      reinterpret_cast<const float *>(_kernelData), convertShapeToTFLiteShape(_biasShape),
+      reinterpret_cast<const float *>(_biasData), convertShapeToTFLiteShape(_outputShape),
+      reinterpret_cast<float *>(_outputData), convertShapeToTFLiteShape(im2colShape),
+      im2colDataToPass);
   return true;
 }
 
 bool ConvolutionLayer::convQuant8()
 {
   ANDROID_NN_CONV_PARAMETERS(uint8_t)
+
   int32_t inputOffset = -_inputShape.offset;
   int32_t kernelOffset = -_kernelShape.offset;
   int32_t outputOffset = _outputShape.offset;
@@ -141,6 +150,24 @@ bool ConvolutionLayer::convQuant8()
   }
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  int32_t dilationWidthFactor = 1, dilationHeightFactor = 1;
+
+  ::tflite::ConvParams op_params;
+  op_params.padding_type = ::tflite::PaddingType::kSame;
+  op_params.padding_values.width = paddingWidth;
+  op_params.padding_values.height = paddingHeight;
+  op_params.stride_width = _strideWidth;
+  op_params.stride_height = _strideHeight;
+  op_params.dilation_width_factor = dilationWidthFactor;
+  op_params.dilation_height_factor = dilationHeightFactor;
+  op_params.input_offset = inputOffset;
+  op_params.weights_offset = kernelOffset;
+  op_params.output_offset = outputOffset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
   static gemmlowp::GemmContext gemm_context;
   // Prevent concurrent executions that may access the scratch buffer and
   // gemm_context.
@@ -148,11 +175,10 @@ bool ConvolutionLayer::convQuant8()
   // Alow gemmlowp automatically decide how many threads to use.
   gemm_context.set_max_num_threads(0);
   ::tflite::optimized_ops::Conv(
-      _inputData, convertShapeToDims(_inputShape), inputOffset, _kernelData,
-      convertShapeToDims(_kernelShape), kernelOffset, reinterpret_cast<const int32_t *>(_biasData),
-      convertShapeToDims(_biasShape), _strideWidth, _strideHeight, paddingWidth, paddingHeight,
-      outputOffset, output_multiplier, output_shift, output_activation_min, output_activation_max,
-      _outputData, convertShapeToDims(_outputShape), im2colData, im2colDim, &gemm_context);
+      op_params, convertShapeToTFLiteShape(_inputShape), _inputData,
+      convertShapeToTFLiteShape(_kernelShape), _kernelData, convertShapeToTFLiteShape(_biasShape),
+      reinterpret_cast<const int32_t *>(_biasData), convertShapeToTFLiteShape(_outputShape),
+      _outputData, convertShapeToTFLiteShape(im2colShape), im2colData, &gemm_context);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
index 41b9afc0c..abe82db5e 100644
--- a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
@@ -44,64 +44,39 @@ FullyConnectedLayer::FullyConnectedLayer()
 static std::mutex executionMutex;
 bool FullyConnectedLayer::fullyConnectedFloat32()
 {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
-  // b/80425683, optimized implementation produces incorrect results when the
-  // number of input elements is the squre of batch_size.
-  uint32_t batch_size = getSizeOfDimension(_outputShape, 0);
-  uint32_t input_n_elements = getNumberOfElements(_inputShape);
-  if (batch_size * batch_size == input_n_elements)
+  int total_input_size = 1;
+  for (uint32_t i = 0; i < _inputShape.dimensions.size(); i++)
   {
-    ::tflite::reference_ops::FullyConnected(
-        reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-        reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape),
-        reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape),
-        output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-        convertShapeToDims(_outputShape));
-  }
-  else
-  {
-    ::tflite::optimized_ops::FullyConnected(
-        reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-        reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape),
-        reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape),
-        output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-        convertShapeToDims(_outputShape));
+    total_input_size *= _inputShape.dimensions[i];
   }
+  // Weights are read as [num_units, input_size]; batch_size follows from the total input size.
+  int input_size = _weightsShape.dimensions[1];
+  const int batch_size = total_input_size / input_size;
+  const int num_units = _weightsShape.dimensions[0];
+
+  TfLiteFusedActivation act = convertFusedActivation(_activation);
+
+  ::tflite::tensor_utils::VectorBatchVectorAssign(reinterpret_cast<const float *>(_biasData),
+                                                  num_units, batch_size,
+                                                  reinterpret_cast<float *>(_outputData));
+
+  // Compute output += weight * input
+  ::tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      reinterpret_cast<const float *>(_weightsData), num_units, input_size,
+      reinterpret_cast<const float *>(_inputData), batch_size,
+      reinterpret_cast<float *>(_outputData), /*result_stride=*/1);
+
+  // Apply activation function
+  ::tflite::tensor_utils::ApplyActivationToVector(reinterpret_cast<float *>(_outputData),
+                                                  batch_size * num_units, act,
+                                                  reinterpret_cast<float *>(_outputData));
+
   return true;
 }
 
 bool FullyConnectedLayer::fullyConnectedQuant8()
 {
-  int32_t inputOffset = -_inputShape.offset;
-  int32_t weightsOffset = -_weightsShape.offset;
-  int32_t outputOffset = _outputShape.offset;
-  float real_multiplier = 0.0;
-  int32_t output_multiplier = 0;
-  int32_t output_shift = 0;
-  int32_t output_activation_min = 0;
-  int32_t output_activation_max = 0;
-  // Caution : 'Convolution' can make misleading. It seems it is just math term.
-  if (!GetQuantizedConvolutionMultipler(_inputShape, _weightsShape, _biasShape, _outputShape,
-                                        &real_multiplier) ||
-      !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, &output_shift))
-  {
-    return false;
-  }
-  CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
-                                &output_activation_max);
-  static gemmlowp::GemmContext gemm_context;
-  // Prevent concurrent executions that access gemm_context.
-  std::unique_lock<std::mutex> lock(executionMutex);
-  // Alow gemmlowp automatically decide how many threads to use.
-  gemm_context.set_max_num_threads(0);
-  ::tflite::optimized_ops::FullyConnected(
-      _inputData, convertShapeToDims(_inputShape), inputOffset, _weightsData,
-      convertShapeToDims(_weightsShape), weightsOffset,
-      reinterpret_cast<const int32_t *>(_biasData), convertShapeToDims(_biasShape), outputOffset,
-      output_multiplier, output_shift, output_activation_min, output_activation_max, _outputData,
-      convertShapeToDims(_outputShape), &gemm_context);
-  return true;
+  throw std::runtime_error{"FullyConnectedLayer : Not tested for TENSOR_QUANT8_ASYMM"};
 }
 
 void FullyConnectedLayer::configure(uint8_t *inputData, const Shape inputShape,
diff --git a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
index 3d96bb401..c4a288b07 100644
--- a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
@@ -26,14 +26,14 @@ namespace kernel
 namespace cpu
 {
 
-#define MAXPOOLING_PARAMETERS                               \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);     \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);      \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);  \
-                                                            \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;           \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;
+#define MAXPOOLING_PARAMETERS                                          \
+  tflite::PoolParams op_params; /* pads use int16; int8 wraps >127 */  \
+  op_params.stride_height = _strideHeight;                             \
+  op_params.stride_width = _strideWidth;                               \
+  op_params.filter_height = _kernelHeight;                             \
+  op_params.filter_width = _kernelWidth;                               \
+  op_params.padding_values.height = static_cast<int16_t>(_paddingTop); \
+  op_params.padding_values.width = static_cast<int16_t>(_paddingLeft);
 
 MaxPoolLayer::MaxPoolLayer()
     : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0),
@@ -46,31 +46,30 @@ MaxPoolLayer::MaxPoolLayer()
 
 bool MaxPoolLayer::maxPoolFloat32()
 {
-
   MAXPOOLING_PARAMETERS
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::MaxPool(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth,
-      _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                   reinterpret_cast<const float *>(_inputData),
+                                   convertShapeToTFLiteShape(_outputShape),
+                                   reinterpret_cast<float *>(_outputData));
   return true;
 }
 bool MaxPoolLayer::maxPoolQuant8()
 {
-
   MAXPOOLING_PARAMETERS
   int32_t output_activation_min = 0;
   int32_t output_activation_max = 0;
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::MaxPool(_inputData, convertShapeToDims(_inputShape), _strideWidth,
-                                   _strideHeight, paddingWidth, paddingHeight, _kernelWidth,
-                                   _kernelHeight, output_activation_min, output_activation_max,
-                                   _outputData, convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape), _inputData,
+                                   convertShapeToTFLiteShape(_outputShape), _outputData);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/OperationUtils.h b/runtimes/neurun/src/kernel/cpu/OperationUtils.h
index 5914d04e3..066b1e573 100644
--- a/runtimes/neurun/src/kernel/cpu/OperationUtils.h
+++ b/runtimes/neurun/src/kernel/cpu/OperationUtils.h
@@ -23,7 +23,9 @@
 #include <limits>
 #include <vector>
 
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "graph/operand/Object.h"
 #include "graph/operand/DataType.h"
 
@@ -75,6 +77,51 @@ inline ::tflite::Dims<4> convertShapeToDims(const Shape &shape)
   return dims;
 }
 
+// Builds a ::tflite::RuntimeShape from four dimensions taken from `shape`.
+// Dimensions are copied in order; when `shape` has fewer than four
+// entries the remaining slots are set to 1. Entries past the fourth
+// are not read.
+inline ::tflite::RuntimeShape convertShapeToTFLiteShape(const Shape &shape)
+{
+  constexpr uint32_t kRank = 4;
+
+  // Pre-fill with ones so absent dimensions need no separate branch.
+  std::vector<int32_t> padded(kRank, 1);
+
+  const auto given = shape.dimensions.size();
+  for (uint32_t axis = 0; axis < kRank && axis < given; ++axis)
+  {
+    padded[axis] = shape.dimensions[axis];
+  }
+
+  return ::tflite::GetTensorShape(padded);
+}
+
+// Translates a FuseCode (ANEURALNETWORKS_FUSED_*) into a TfLiteFusedActivation.
+// NONE, RELU, RELU1 and RELU6 map one-to-one; any other code yields
+// kTfLiteActNone.
+inline TfLiteFusedActivation convertFusedActivation(FuseCode act)
+{
+  TfLiteFusedActivation converted = kTfLiteActNone;
+
+  switch (act)
+  {
+    case ANEURALNETWORKS_FUSED_RELU:
+      converted = kTfLiteActRelu;
+      break;
+    case ANEURALNETWORKS_FUSED_RELU1:
+      converted = kTfLiteActRelu1;
+      break;
+    case ANEURALNETWORKS_FUSED_RELU6:
+      converted = kTfLiteActRelu6;
+      break;
+    default: // ANEURALNETWORKS_FUSED_NONE and unrecognised codes
+      break;
+  }
+
+  return converted;
+}
+
 __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier,
                                             int32_t *right_shift);
 
diff --git a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
index 4f5a69f2e..c998c65f6 100644
--- a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
@@ -33,45 +33,86 @@ SoftMaxLayer::SoftMaxLayer()
   // DO NOTHING
 }
 
+// Row-wise softmax over a dense (batch_size x input_size) float buffer:
+// out[b][i] = exp((in[b][i] - max_b) * beta) / sum_j exp((in[b][j] - max_b) * beta),
+// where max_b is the largest value in row b.
+void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
+             float *out)
+{
+  TF_LITE_ASSERT(input_size > 0);
+
+  const float *src = in;
+  float *dst = out;
+
+  for (int row = 0; row < batch_size; ++row, src += input_size, dst += input_size)
+  {
+    // Largest entry of the row; subtracted before exponentiation.
+    float row_max = src[0];
+    for (int k = 1; k < input_size; ++k)
+    {
+      row_max = (src[k] > row_max) ? src[k] : row_max;
+    }
+
+    // Store the shifted, scaled exponentials and accumulate their total.
+    float total = 0.0;
+    for (int k = 0; k < input_size; ++k)
+    {
+      dst[k] = std::exp((src[k] - row_max) * beta);
+      total += dst[k];
+    }
+
+    // Scale every entry of the row by the reciprocal of the total.
+    const float inv_total = 1.f / total;
+    for (int k = 0; k < input_size; ++k)
+    {
+      dst[k] *= inv_total;
+    }
+    // src and dst move on to the next row in the loop header.
+  }
+}
+
 bool SoftMaxLayer::softmaxFloat32()
 {
-  ::tflite::Dims<4> dim;
+  // 2D inputs are handled by the local Softmax() helper; 4D inputs are
+  // forwarded to ::tflite::optimized_ops::Softmax.
   if (getNumberOfDimensions(_inputShape) == 2)
   {
     uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
     uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
-    Shape shapeIn4D;
-    shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
-    dim = convertShapeToDims(shapeIn4D);
+    Softmax(reinterpret_cast<const float *>(_inputData), input_size, batch_size, _beta,
+            reinterpret_cast<float *>(_outputData));
   }
   else if (getNumberOfDimensions(_inputShape) == 4)
   {
-    dim = convertShapeToDims(_inputShape);
+    ::tflite::SoftmaxParams op_params;
+    op_params.beta = _beta;
+    ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(_inputShape),
+                                     reinterpret_cast<const float *>(_inputData),
+                                     convertShapeToTFLiteShape(_outputShape),
+                                     reinterpret_cast<float *>(_outputData));
   }
   else
   {
     std::cout << "only 2D and 4D tensors supported" << std::endl;
     return false;
   }
-  ::tflite::optimized_ops::Softmax(reinterpret_cast<const float *>(_inputData), dim, _beta,
-                                   reinterpret_cast<float *>(_outputData), dim);
+
   return true;
 }
 
 bool SoftMaxLayer::softmaxQuant8()
 {
-  ::tflite::Dims<4> dim;
+  Shape shapeIn4D = _inputShape;
+
   if (getNumberOfDimensions(_inputShape) == 2)
   {
     uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
     uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
-    Shape shapeIn4D;
     shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
-    dim = convertShapeToDims(shapeIn4D);
   }
   else if (getNumberOfDimensions(_inputShape) == 4)
   {
-    dim = convertShapeToDims(_inputShape);
+    shapeIn4D = _inputShape;
   }
   else
   {
@@ -94,8 +135,13 @@ bool SoftMaxLayer::softmaxQuant8()
     return false;
   }
   float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift);
-  ::tflite::optimized_ops::Softmax(_inputData, dim, input_multiplier, input_left_shift, diff_min,
-                                   _outputData, dim);
+
+  ::tflite::SoftmaxParams op_params;
+  op_params.input_multiplier = input_multiplier;
+  op_params.input_left_shift = input_left_shift;
+  op_params.diff_min = diff_min;
+  ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(shapeIn4D), _inputData,
+                                   convertShapeToTFLiteShape(shapeIn4D), _outputData);
   return true;
 }