Diffstat (limited to 'compute/cker')
85 files changed, 9678 insertions, 1409 deletions
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt index 609dd45a3..d464dccae 100644 --- a/compute/cker/CMakeLists.txt +++ b/compute/cker/CMakeLists.txt @@ -8,15 +8,33 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) target_link_libraries(nnfw_lib_cker INTERFACE ruy) target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) -if(EXPERIMENTAL_RUY_FEATURE) - target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) -endif(EXPERIMENTAL_RUY_FEATURE) if(PROFILE_RUY) target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) endif(PROFILE_RUY) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + target_compile_definitions(nnfw_lib_cker INTERFACE CKER_X86_PLATFORM) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) # Workaround to avoid warning # TODO Resolve warning target_compile_options(nnfw_lib_cker INTERFACE -Wno-attributes) + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +set(TEST_CKER test_cker) + +file(GLOB_RECURSE TESTS "src/*.test.cc") + +add_executable(${TEST_CKER} ${TESTS}) + +target_link_libraries(${TEST_CKER} nnfw_lib_cker) +target_link_libraries(${TEST_CKER} nnfw_coverage) +target_link_libraries(${TEST_CKER} gtest gtest_main ${LIB_PTHREAD}) + +add_test(${TEST_CKER} ${TEST_CKER}) +install(TARGETS ${TEST_CKER} DESTINATION unittest) diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h new file mode 100644 index 000000000..8ec6140bd --- /dev/null +++ b/compute/cker/include/cker/CpuBackendThreadpool.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ +#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ + +#include <ruy/context.h> // from @ruy +#include <ruy/thread_pool.h> // from @ruy + +#include <stdexcept> + +namespace nnfw +{ +namespace cker +{ +namespace cpu_backend_threadpool +{ + +using Task = ruy::Task; + +template <typename TaskType> +void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context) +{ + assert(ruy_context != nullptr); + assert(tasks_count <= ruy_context->max_num_threads()); + if (ruy_context == nullptr) + { + throw std::runtime_error("CpuBackendThreadpool.h: ruy::Context is null"); + } + ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks); +} + +} // namespace cpu_backend_threadpool +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index 246fd9a46..45ad969c3 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -20,11 +20,13 @@ #include <ruy/path.h> #include <ruy/ruy.h> -#include <ruy/detect_arm.h> #include "cker/Types.h" #include "cker/neon/neon_check.h" #include "cker/ruy/RuySupport.h" #include "util/logging.h" +#if defined __linux__ && defined __aarch64__ +#include <sys/auxv.h> +#endif #include <cassert> #include <cmath> @@ -41,6 +43,8 @@ namespace cker namespace { +constexpr int kFloatValuesPerNeonVector = 4; + // TODO(ahentz): Clean up. using int8 = std::int8_t; using uint8 = std::uint8_t; @@ -49,6 +53,11 @@ using uint16 = std::uint16_t; using int32 = std::int32_t; using uint32 = std::uint32_t; +template <int PerNeonSize> inline int RoundDownVectors(int size) +{ + return size & ~(PerNeonSize - 1); +} + // Allocates, at least, size bytes of uninitialized storage whose alignment is // specified by alignment. The size parameter must be an integral multiple of // alignment. @@ -73,14 +82,37 @@ inline int32_t AccumulateNeonLane(const int32x4_t lane) } // namespace -#ifdef __aarch64__ +// The implementation of dotprod detection is copied from ruy's internal +// function DetectDotprod(). +// At the moment it's only implemented on Linux ARM64. Consider syncing again +// with ruy in the future to share improvements. +#if defined __linux__ && defined __aarch64__ +inline bool DetectDotprodByLinuxAuxvMethod() +{ + // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers, + // however we need to support building against older headers for the time + // being. + const int kLocalHwcapAsimddp = 1 << 20; + return getauxval(AT_HWCAP) & kLocalHwcapAsimddp; +} +#endif + +inline bool DetectArmNeonDotprod() +{ +#if defined __linux__ && defined __aarch64__ + return DetectDotprodByLinuxAuxvMethod(); +#endif -bool HasSdotInstruction() + return false; +} + +inline bool HasSdotInstruction() { - static const bool has_dotprod = ruy::DetectDotprod(); + static const bool has_dotprod = DetectArmNeonDotprod(); return has_dotprod; } +#ifdef __aarch64__ // We interleave vector data to make the dot product logic more efficient. // Suppose that vectors is: // a0 a1 a2 a3 a4 a5 ... @@ -93,13 +125,13 @@ bool HasSdotInstruction() // e0 e1 e2 e3 f0 f1 f2 f3 ... // Once the data is interleaved, each 16-byte read from the vectors pointer // contains 4 bytes from each of 4 vectors. 
-const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols, - void **shuffled_vectors_free) +inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols, + void **shuffled_vectors_free) { const int kWeightsPerUint32 = 4; int8 *shuffled_vectors = reinterpret_cast<int8 *>( - aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); for (int i = 0; i < n_batch; i += 4) { @@ -113,25 +145,25 @@ const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int while (unshuffled_vec0_ptr != end_vec0_ptr) { asm volatile( - // This code path requires that (n_cols % 16) == 0 so we can safely - // read in 16-byte chunks from each row. - "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" - "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" - "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" - "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" - - "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" - - : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), - [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), - [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), - [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), - [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) - : - : "v0", "v1", "v2", "v3", "cc", "memory"); + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. + "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr), + [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr), + [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr), + [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr), + [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); } } @@ -172,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); asm volatile( - // Zero out the accumulator registers. - "dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - - "1:\n" // batch_cols_loop - - // Read 16 more bytes from a pair of matrix rows. - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - - // Prefetch two rows ahead. - "prfm pldl1strm, [%[mat_ptr2]]\n" - "prfm pldl1strm, [%[mat_ptr3]]\n" - - // Read from input vectors 4 times; 64 bytes total. - // Each 16-byte register contains parts of 4 vectors; see the - // shuffle logic above. 
- - // From Benoit, places to look in the future: - // - Move load instructions further from sdot - // - Switch loop use-then-reload - // - Do partial unrolling to use register space better - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - - // Update prefetch pointers. - "add %[mat_ptr2], %[mat_ptr2], #16\n" - "add %[mat_ptr3], %[mat_ptr3], #16\n" - - // Re-use those vectors for the next row as well. - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - - // If we're not done with these rows, continue. - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" // batch_cols_loop - - // Done with the rows, sum the results. - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - // Convert the per-vector sums to floating point. - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Fetch scale factors. - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - - // Multiply scale factors times sums. - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - - // Load previous result values. - // The result position is: - // result[batch * m_rows + row] - // Here that is factored into: - // result_ptr = result + row - // *result_ptr = res[0] - // (uint8*)result_ptr += (m_rows * sizeof(float)) - // *result_ptr = res[1] - // ... - // Since we're reading two rows at a time, though, we read both - // result[batch * m_rows + row] - // and - // result[batch * m_rows + row + 1] - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - - // Go back to the starting position (subtract wide_rows * 4). - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - - // Add previous result values. - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - - // Store results. - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) - : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), - [wide_rows] "r"(wide_rows) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "cc", "memory"); + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. 
+ + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... + // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. 
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); } } @@ -277,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr } static void DotprodMatrixBatchFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { void *shuffled_vectors_free; const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); @@ -300,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int32_t *batch_offsets_ptr = input_offset + batch; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; - asm volatile("dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - // Load zero points. - "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - // Zero out zero point accumulators. - "dup v14.4s, wzr\n" - "dup v15.4s, wzr\n" - - // Load per channel scales if not null. - "cmp %w[is_channel_scale_nullptr], #0\n" - "bne 1f\n" - "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" - "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - "fmul v16.4s, v16.4s, v4.4s\n" - "fmul v17.4s, v17.4s, v4.4s\n" - "b 2f\n" - "1:\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "2:\n" - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 3f\n" - // Accumulate row_sums for zero point calculations. - "saddlp v12.8h, v12.16b\n" - "saddlp v13.8h, v13.16b\n" - "sadalp v14.4s, v12.8h\n" - "sadalp v15.4s, v13.8h\n" - "3:\n" - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 2b\n" - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 4f\n" - // Calculate zero point offsets. 
- "addv s14, v14.4s\n" - "addv s15, v15.4s\n" - "dup v14.4s, v14.s[0]\n" - "dup v15.4s, v15.s[0]\n" - "b 5f\n" - "4:\n" - "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" - "ld1r {v15.4s}, [%[row_sums_ptr]]\n" - "5:\n" - - "mul v14.4s, v14.4s, v7.4s\n" - "mul v15.4s, v15.4s, v7.4s\n" - "sub v0.4s, v0.4s, v14.4s\n" - "sub v2.4s, v2.4s, v15.4s\n" - - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Multiply scale. - "fmul v0.4s, v16.4s, v0.4s\n" - "fmul v1.4s, v17.4s, v1.4s\n" - - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) - : [mat_ptr0_end] "r"(mat_ptr0_end), - [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), - [channel_scales_ptr] "r"(channel_scales_ptr), - [batch_offsets_ptr] "r"(batch_offsets_ptr), - [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), - [is_row_sums_nullptr] "r"(is_row_sums_nullptr) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + asm volatile( + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. + "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. 
+ "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); } } @@ -425,10 +457,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( // // We don't use this kernel when n_batch = 1 because the baseline kernel // is fine for that case. 
-void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { const int kWeightsPerUint32 = 4; @@ -443,14 +475,14 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_vectors_free; const int padded_vectors_size = batch_round_up * m_cols; int8_t *padded_vectors = reinterpret_cast<int8_t *>( - aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); memset(padded_vectors, 0, padded_vectors_size); void *padded_result_free; const int result_size = n_batch * m_rows * sizeof(float); const int padded_result_size = batch_round_up * m_rows * sizeof(float); float *padded_result = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); memcpy(padded_result, result, result_size); memset(reinterpret_cast<char *>(padded_result) + result_size, 0, padded_result_size - result_size); @@ -462,7 +494,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_scaling_factors_free; const int padded_scaling_factors_size = batch_round_up * sizeof(float); float *padded_scaling_factors = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size); assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); @@ -473,7 +505,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_input_offset_free; const int padded_input_offset_size = batch_round_up * sizeof(int32_t); int32_t *padded_input_offset = reinterpret_cast<int32_t *>( - aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size); assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); @@ -481,8 +513,8 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( // Call the main kernel. 
DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, - padded_result, per_channel_scale, padded_input_offset, row_sums); + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result, + per_channel_scale, padded_input_offset, row_sums); free(padded_input_offset_free); } @@ -500,20 +532,40 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( free(padded_scaling_factors_free); } -void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result) +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) { DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, - /*row_sums=*/nullptr); + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); } #endif // __aarch64__ -bool NeonIsZeroVector(const float *vector, int v_size) +inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value); + const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value); + + int i = 0; + for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector) + { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + i); + // Clip between clipping_value and -clipping_value. + v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4); + v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4); + // Save to output. + vst1q_f32(vector + i, v_f32x4); + } + for (; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value); + } +} + +inline bool NeonIsZeroVector(const float *vector, int v_size) { // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot // use the main vectorized loop, and we need to process sequentially. 
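[Not part of the patch — a minimal sketch of the round-down/postamble pattern that the new NEON helpers above (NeonCwiseClipping, NeonSub1Vector) share, assuming the patch's kFloatValuesPerNeonVector and RoundDownVectors plus <arm_neon.h> are in scope; the function name and the negate operation are illustrative only.]

// Illustration: RoundDownVectors<4>(size) clears the low two bits, e.g.
// RoundDownVectors<4>(10) == 8, so elements [0, 8) are handled four at a time
// and elements 8..9 fall into the scalar postamble.
inline void NegateWithPostamble(const float *in, float *out, int v_size) // illustrative name
{
  const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
  int i = 0;
  for (; i < postamble_start; i += kFloatValuesPerNeonVector)
  {
    // Vectorized body: process 4 floats per iteration.
    vst1q_f32(out + i, vnegq_f32(vld1q_f32(in + i)));
  }
  for (; i < v_size; ++i)
  {
    // Scalar postamble for the remaining v_size - postamble_start elements.
    out[i] = -in[i];
  }
}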
@@ -544,15 +596,16 @@ bool NeonIsZeroVector(const float *vector, int v_size) return true; } -void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, - const int8_t *input_to_gate_weights, int32_t n_batch, int32_t n_input, - int32_t n_output, int32_t, int32_t *scratch, ruy::Context *ruy_context) +inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, + const int8_t *input_to_gate_weights, int32_t n_batch, + int32_t n_input, int32_t n_output, int32_t, int32_t *scratch, + ruy::Context *ruy_context) { MatrixParams<int8_t> lhs_params; lhs_params.order = Order::kRowMajor; lhs_params.rows = n_output; lhs_params.cols = n_input; - lhs_params.cacheable = true; + lhs_params.cache_policy = CachePolicy::kAlwaysCache; MatrixParams<int8_t> rhs_params; rhs_params.order = Order::kColMajor; @@ -574,19 +627,44 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, ruy::Matrix<int8_t> ruy_lhs; ruy::Matrix<int8_t> ruy_rhs; ruy::Matrix<int32_t> ruy_dst; - ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs); - ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs); + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true); ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst); - ruy::BasicSpec<int32_t, int32_t> ruy_spec; - ruy_support::MakeRuySpec(gemm_params, &ruy_spec); + ruy::MulParams<int32_t, int32_t> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); - constexpr ruy::Path kRuyPath = ruy::kAllPaths; - ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst); + ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +inline void NeonSub1Vector(const float *vector, int v_size, float *result) +{ + // If v_size is not divisible by the vector size, then we need to process the + // final few elements sequentially. postamble_start shows the start index + // where this should happen. + const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size); + + float32x4_t one_f32x4 = vmovq_n_f32(1.0); + int v = 0; + for (; v < postamble_start; v += kFloatValuesPerNeonVector) + { + // Load 4 float values from the current pointers of the input column and + // subtract from 1. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4); + // Save to output. + vst1q_f32(result + v, result_f32x4); + } + for (; v < v_size; v++) + { + result[v] = 1.0f - vector[v]; + } } -void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min, float *max, float *scaling_factor) +inline void NeonSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min, float *max, + float *scaling_factor) { // TODO(raziel): vectorize min/max calculation. 
auto minmax = std::minmax_element(values, values + size); @@ -658,15 +736,16 @@ void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *qu for (int i = postamble_start; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); + static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } -void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows, - const int m_cols, const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result, int result_stride) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + float *__restrict__ result, int result_stride) { #ifdef __aarch64__ if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch) @@ -751,7 +830,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, // Here the assumption is that each buffer is 4-byte aligned. Otherwise, // performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); // Multiply the low bits (i.e. the lower 8 8bit numbers in the @@ -776,7 +855,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, // Here the assumption is that each buffer is 4-bytes aligned. // Otherwise, performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); @@ -804,9 +883,9 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, free(aligned_vec_free); } -void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, float *result, + int result_stride) { // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main // vectorized loop, and we need to process sequentially. 
postamble_start shows @@ -845,11 +924,12 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, in } } -void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows, - const int m_cols, const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - int32_t *scratch, float *__restrict__ result, - int result_stride, ruy::Context *ruy_context) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *__restrict__ result, + int result_stride, ruy::Context *ruy_context) { if (m_rows % 4 == 0 && result_stride == 1) { @@ -872,7 +952,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); const float32x4_t result1 = - vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); vst1q_f32(result, result0); vst1q_f32(result + 4 * result_stride, result1); } diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 54714e214..7e4b01a01 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -45,6 +45,10 @@ public: return a < 0.f ? 0.f : a; case FusedActivationFunctionType::kRelu6: return std::max(0.f, std::min(a, 6.f)); + case FusedActivationFunctionType::kTanh: + return std::tanh(a); + case FusedActivationFunctionType::kSigmoid: + return 1.0f / (1.0f + std::exp(-a)); default: // TODO(aselle): More informative fatal error! 
exit(1); @@ -55,8 +59,17 @@ private: FusedActivationFunctionType act_; }; -void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, - float *batch_vector) +template <typename T> +void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value)); + } +} + +inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) { for (int b = 0; b < n_batch; b++) { @@ -64,7 +77,20 @@ void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batc } } -bool PortableIsZeroVector(const float *vector, int v_size) +inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch, + float *batch_vector) +{ + for (int b = 0; b < n_batch; b++) + { + for (int i = 0; i < v_size; ++i) + { + batch_vector[i] += vector[i]; + } + batch_vector += v_size; + } +} + +inline bool PortableIsZeroVector(const float *vector, int v_size) { for (int i = 0; i < v_size; ++i) { @@ -74,8 +100,8 @@ bool PortableIsZeroVector(const float *vector, int v_size) return true; } -void PortableApplyActivationToVector(const float *vector, int v_size, - FusedActivationFunctionType activation, float *result) +inline void PortableApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) { auto activation_func = ActivationFunctor(activation); for (int v = 0; v < v_size; v++) @@ -84,8 +110,17 @@ void PortableApplyActivationToVector(const float *vector, int v_size, } } -void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min_value, float *max_value, float *scaling_factor) +inline void PortableSub1Vector(const float *vector, int v_size, float *result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = 1.0f - *vector++; + } +} + +inline void PortableSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min_value, + float *max_value, float *scaling_factor) { auto minmax = std::minmax_element(values, values + size); *min_value = *minmax.first; @@ -103,17 +138,72 @@ void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); + static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. 
quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } -void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result, int result_stride) +inline void PortableAsymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *scaling_factor, + int32_t *offset) +{ + /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */ + const int32_t kMinScale = -128; + const int32_t kMaxScale = 127; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const auto minmax = std::minmax_element(values, values + size); + const double rmin = static_cast<double>(std::min(0.0f, *minmax.first)); + const double rmax = static_cast<double>(std::max(0.0f, *minmax.second)); + if (rmin == rmax) + { + memset(quantized_values, 0, size * sizeof(int8_t)); + *scaling_factor = 1; + *offset = 0; + return; + } + else + { + double scale = (rmax - rmin) / (qmax_double - qmin_double); + const double zero_point_from_min = qmin_double - rmin / scale; + const double zero_point_from_max = qmax_double - rmax / scale; + const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale); + const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale); + const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + int8_t nudged_zero_point = 0; + if (zero_point_double <= qmin_double) + { + nudged_zero_point = kMinScale; + } + else if (zero_point_double >= qmax_double) + { + nudged_zero_point = kMaxScale; + } + else + { + nudged_zero_point = static_cast<int8_t>(round(zero_point_double)); + } + *scaling_factor = scale; + *offset = nudged_zero_point; + } + const float scaling_factor_inv = 1.0f / *scaling_factor; + for (int i = 0; i < size; ++i) + { + const int32_t quantized_value = + static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv)); + quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value)); + } +} + +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + float *__restrict__ result, + int result_stride) { int batch, row, col; for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) @@ -138,20 +228,20 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matr } // for batch } -void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *__restrict__ vector, - const float *scaling_factors, int n_batch, - int32_t *, float *__restrict__ result, - int result_stride, ruy::Context *) +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vector, + const float *scaling_factors, int n_batch, + int32_t *, float *__restrict__ result, + int result_stride, ruy::Context *) { PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors, n_batch, result, result_stride); } -void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) 
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, + float *result, int result_stride) { float *result_in_batch = result; for (int b = 0; b < n_batch; b++) @@ -171,7 +261,36 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows } } -void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); } +inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector, + int v_size, int n_batch) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + float sum = 0.0f; + for (int i = 0; i < v_size; ++i) + { + sum += input_vector[i]; + } + const float mean = sum / v_size; + float sum_diff_sq = 0.0f; + for (int i = 0; i < v_size; ++i) + { + const float diff = input_vector[i] - mean; + sum_diff_sq += diff * diff; + } + const float variance = sum_diff_sq / v_size; + constexpr float kNormalizationConstant = 1e-8f; + const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant); + for (int i = 0; i < v_size; ++i) + { + output_vector[i] = (input_vector[i] - mean) * stddev_inv; + } + input_vector += v_size; + output_vector += v_size; + } +} + +inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h index 2486f01a6..9269ce9aa 100644 --- a/compute/cker/include/cker/Shape.h +++ b/compute/cker/include/cker/Shape.h @@ -136,12 +136,27 @@ public: std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); } + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + template <typename T> inline void BuildFrom(const T &src_iterable) { const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); Resize(dimensions_count); int32_t *data = DimsData(); - for (auto it : src_iterable) + for (auto &&it : src_iterable) { *data = it; ++data; @@ -172,7 +187,6 @@ public: for (int i = 0; i < _size; i++) { const int dim = dims_data[i]; - assert(dim >= 1); buffer_size *= dim; } return buffer_size; diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h index e07c91239..bac79b887 100644 --- a/compute/cker/include/cker/TensorUtils.h +++ b/compute/cker/include/cker/TensorUtils.h @@ -31,55 +31,133 @@ namespace nnfw namespace cker { -void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector) +inline void CwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); +} + +inline void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector) +{ + PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector); +} + +inline void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) { PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); } -bool IsZeroVector(const float *vector, int v_size) +// Cwise product of two vectors. 
+template <typename T> +inline void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2, + int v_size, T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = *vector1++ * *vector2++; + } +} + +// Cwise product and accumulate of two vectors. Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +template <typename T> +inline void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1, + const T *__restrict__ vector2, int v_size, + T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ += *vector1++ * *vector2++; + } +} + +// Cwise product of a vector and a batch-vector. +template <typename T> +inline void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector, + int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProduct(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +template <typename T> +inline void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size, + const T *batch_vector, int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +inline bool IsZeroVector(const float *vector, int v_size) { return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); } -void ApplyActivationToVector(const float *vector, int v_size, - FusedActivationFunctionType activation, float *result) +inline void ApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) { PortableApplyActivationToVector(vector, v_size, activation, result); } -void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min, float *max, float *scaling_factor) +inline void Sub1Vector(const float *vector, int v_size, float *result) +{ + NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); +} + +inline void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, + float *min, float *max, float *scaling_factor) { return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max, scaling_factor); } -void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, - const int8_t *vector, const float *scaling_factors, - int n_batch, float *result, int result_stride) +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vector, + const float *scaling_factors, int n_batch, + float *result, int result_stride) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, scaling_factors, n_batch, result, result_stride); } -void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) +inline void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, float *result, + int result_stride) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch, result, 
result_stride); } -void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, - const int8_t *vectors, const float *scaling_factors, - int n_batch, int32_t *scratch, float *result, - int result_stride, ruy::Context *ruy_context) +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *result, int result_stride, + ruy::Context *ruy_context) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, scratch, result, result_stride, ruy_context); } -void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } +inline void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size, + int n_batch) +{ + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +inline void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index c0c9313ea..3fd0cf5b6 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -34,6 +34,8 @@ enum class FusedActivationFunctionType kRelu6 = 1, kRelu1 = 2, kRelu = 3, + kTanh = 4, + kSigmoid = 6, }; enum class PaddingType { @@ -78,8 +80,6 @@ enum class BroadcastableOpCategory : uint8_t struct PoolParams { - FusedActivationFunctionType activation; - PaddingType padding_type; PaddingValues padding_values; int stride_height; int stride_width; @@ -109,6 +109,8 @@ struct SoftmaxParams int32_t zero_point; float scale; float *table; + uint8_t *uint8_table1; + uint8_t *uint8_table2; }; struct PackParams @@ -170,25 +172,25 @@ struct ComparisonParams struct BinaryArithmeticOpParam { // Shape dependent / common to data / op types. - BroadcastableOpCategory broadcast_category; + BroadcastableOpCategory broadcast_category{BroadcastableOpCategory::kNone}; // uint8 inference params. - int32_t input1_offset; - int32_t input2_offset; - int32_t output_offset; - int32_t output_multiplier; - int32_t output_shift; + int32_t input1_offset = 0; + int32_t input2_offset = 0; + int32_t output_offset = 0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; // Add / Sub, not Mul, uint8 inference params. - int32_t left_shift; - int32_t input1_multiplier; - int32_t input1_shift; - int32_t input2_multiplier; - int32_t input2_shift; + int32_t left_shift = 0; + int32_t input1_multiplier = 0; + int32_t input1_shift = 0; + int32_t input2_multiplier = 0; + int32_t input2_shift = 0; // uint8, etc, activation params. - int32_t quantized_activation_min; - int32_t quantized_activation_max; + int32_t quantized_activation_min = 0; + int32_t quantized_activation_max = 0; // float activation params. - float float_activation_min; - float float_activation_max; + float float_activation_min = 0; + float float_activation_max = 0; // Processed output dimensions. // Let input "a" be the one that broadcasts in the faster-changing dimension. @@ -256,9 +258,12 @@ struct FullyConnectedParams // uint8, etc, activation params. int32_t quantized_activation_min; int32_t quantized_activation_max; - // float activation params. + // float activation params float float_activation_min; float float_activation_max; + // Mark the operands as cacheable if they are unchanging, e.g. weights. 
+ bool lhs_cacheable; + bool rhs_cacheable; // FullyConnectedWeightsFormat weights_format; }; @@ -268,6 +273,27 @@ struct L2NormParams int32_t input_zero_point; }; +enum LSTMKernelType +{ + kTfLiteLSTMFullKernel = 0, + kTfLiteLSTMBasicKernel +}; + +struct LSTMParams +{ + // Parameters for LSTM version 1. + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + float cell_clip; + float proj_clip; + + // Parameters for LSTM version 2. + // kTfLiteLSTMBasicKernel is only supported in version 2 or above. + LSTMKernelType kernel_type; + + // Parameters for LSTM version 4. + bool asymmetric_quantize_inputs; +}; + struct GatherParams { int32_t axis; @@ -366,12 +392,24 @@ struct SpaceToDepthParams int32_t block_size; }; +struct LeakyReluParams +{ + float alpha; +}; + enum class Order { kColMajor, kRowMajor }; +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + // MatrixParams encapsulates the parameters that Gemm needs about each // matrix, besides the buffer data pointer. // Compare to ruy::Matrix, which also encapsulates the data pointer. @@ -390,10 +428,13 @@ template <typename Scalar> struct MatrixParams // The zero_point, i.e. which Scalar value is to be interpreted as zero. // When Scalar is floating-point, this must be 0. Scalar zero_point = 0; - // Indicate whether the underlying data will remain unchanged for - // some period of time. Defaults to false, but should be set to true - // for unchanging data (e.g. weights buffers in many cases) - bool cacheable = false; + // When the data pointed to by this matrix is constant data, so that it is + // valid to assume that equality of pointers implies equality of data, + // a CachePolicy may be used instead of the default kNeverCache, + // which will enable ruy to take advantage of this constancy of the data to + // cache the packing work, which can be a large speedup in matrix*vector + // and other narrow shapes. + CachePolicy cache_policy = CachePolicy::kNeverCache; }; // Enumeration of broad categories of Gemm. @@ -442,9 +483,9 @@ enum class QuantizationFlavor // (only those that need perchannel quantization do). template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor = - std::is_floating_point<AccumScalar>::value - ? QuantizationFlavor::kFloatingPoint - : QuantizationFlavor::kIntegerWithUniformMultiplier> + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> struct GemmParams { // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) @@ -471,12 +512,12 @@ struct GemmParams const AccumScalar *bias = nullptr; // min clamp bound of destination values. DstScalar clamp_min = std::is_floating_point<DstScalar>::value - ? -std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::lowest(); + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); // max clamp bound of destination values. DstScalar clamp_max = std::is_floating_point<DstScalar>::value - ? std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::max(); + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); }; // Validates self-consistency of GemmParams. 
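[Not part of the patch — a minimal sketch of how the new CachePolicy field on MatrixParams is meant to be used, mirroring the lhs_params setup in NeonCpuBackendGemm earlier in this diff; the variable name and dimensions are illustrative only.]

// Illustration: constant data such as weights can be marked kAlwaysCache so ruy may
// reuse the packed matrix across calls; changing data keeps the default kNeverCache.
MatrixParams<int8_t> weights_params; // illustrative name
weights_params.order = Order::kRowMajor;
weights_params.rows = n_output; // illustrative dimensions
weights_params.cols = n_input;
weights_params.cache_policy = CachePolicy::kAlwaysCache; // weights are unchanging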
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index 2abb998d0..9aae0a957 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -20,6 +20,8 @@ #include "Shape.h" +#include "neon/neon_check.h" + #include <algorithm> #include <cstdint> #include <fixedpoint/fixedpoint.h> @@ -29,6 +31,11 @@ namespace nnfw namespace cker { +template <typename T> struct is_quant8 +{ + static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value; +}; + template <typename T> inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) { @@ -88,8 +95,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), - right_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); } inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, @@ -103,8 +110,36 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int left_shift) { return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); +} + +#ifdef USE_NEON +inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val, + int32_t quantized_multiplier, int32_t shift) +{ + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); + int32x4x4_t result; + + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[1] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[2] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[3] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup); + + return result; } +#endif inline int NodeOffset(int b, int h, int w, int height, int width) { @@ -162,7 +197,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, const F3 fixedpoint_input = F3::FromRaw(input >> 1); const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); // Newton-Raphson iteration // Naive unoptimized starting guess: x = 1 F3 x = F3::One(); @@ -173,7 +208,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); } const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) 
/ 2.); x = x * fixedpoint_half_sqrt_2; *output_inv_sqrt = x.raw(); if (*output_shift < 0) @@ -429,7 +464,7 @@ template <typename T> class SequentialTensorWriter { public: SequentialTensorWriter(const T *input_data, T *output_data) - : input_data_(input_data), output_ptr_(output_data) + : input_data_(input_data), output_ptr_(output_data) { } diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h index 49c34211a..e3b10990e 100644 --- a/compute/cker/include/cker/eigen/EigenSupport.h +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -39,17 +39,17 @@ namespace eigen_support // library. typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenMatrix; + EigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenMatrix; + ConstEigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenTensor; + EigenTensor; typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenTensor; + ConstEigenTensor; // Utility functions we need for the EigenTensor API. template <typename Device, typename T> struct MatMulConvFunctor diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h index f9c706370..40cb85432 100644 --- a/compute/cker/include/cker/eigen/Utils.h +++ b/compute/cker/include/cker/eigen/Utils.h @@ -36,9 +36,9 @@ namespace cker // Eigen::Map<Eigen::Matrix<const float, ...>> template <typename Scalar> using VectorMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape) { @@ -51,10 +51,10 @@ template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Sha // above also applies here. 
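// For example, a float buffer with Shape {2, 3, 4} maps through MapAsVector to
// a flat 24-element Eigen vector view, and through MapAsMatrixWithLastDimAsRows
// to a 4 x 6 matrix view: the last dimension becomes the rows and the leading
// dimensions are flattened into the columns.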
template <typename Scalar> using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, - Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + std::is_const<Scalar>::value, + Eigen::Map< + const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; template <typename Scalar> MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h index dc3e2552d..9d4fd2eaf 100644 --- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket public: template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae( - typename std::enable_if< - unpacket_traits<PacketT>::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( - std::declval<IndexT>(), - std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) - -> std::true_type; + typename std::enable_if< + unpacket_traits<PacketT>::masked_load_available && + std::is_same<PacketT, + decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) + -> std::true_type; template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae(...) -> std::false_type; typedef decltype( - functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; + functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; static constexpr bool value = status::value; }; @@ -71,9 +70,9 @@ public: // [from, to) range. If the mask bit is 1, element will be loaded/stored. template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename std::enable_if<unpacket_traits<Packet>::masked_load_available, - typename unpacket_traits<Packet>::mask_t>::type - mask(int from, int to) + typename std::enable_if<unpacket_traits<Packet>::masked_load_available, + typename unpacket_traits<Packet>::mask_t>::type + mask(int from, int to) { const Index packet_size = internal::unpacket_traits<Packet>::size; eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); diff --git a/compute/cker/include/cker/eigen/eigen_gemm_eigen.h b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h new file mode 100644 index 000000000..d4f8fc09d --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ +#define __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ + +// See b/131835803: in TFLite code, because eigen_spatial_convolutions.h does +// #define Eigen EigenForTFLite, it is difficult to have any #include of Eigen +// headers in a header file, as that results in name classes (compilation +// errors) depending on the order in which these headers are #included. +// So we have moved the #include of Eigen here, in a .cc file, where we have +// control over the header #include sequence. +// #include "third_party/eigen3/Eigen/Core" +// #include "tensorflow/lite/kernels/cpu_backend_context.h" +// #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +// #include "tensorflow/lite/kernels/internal/common.h" +// #include "cker/eigen/eigen_convolution_helpers.h" +#include "cker/operation/Common.h" +#include "cker/Types.h" + +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ +namespace detail +{ + +// tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h and cpu_backend_gemm_eigen.cc +struct GemmImplUsingEigen +{ + static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data, + const MatrixParams<float> &rhs_params, const float *rhs_data, + const MatrixParams<float> &dst_params, float *dst_data, + const GemmParams<float, float> ¶ms) + { + // This code assumes specific storage orders, encoded in these Eigen types. + // These assumptions have been checked by TF_LITE_ASSERT's in the public + // Gemm entry point already, before the implementation gets to this point. + using EigenMatrixMapRowMajorConst = + Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; + using EigenMatrixMapColMajorConst = + Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>; + using EigenMatrixMapColMajorMutable = + Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>; + + EigenMatrixMapRowMajorConst eigen_lhs(lhs_data, lhs_params.rows, lhs_params.cols); + EigenMatrixMapColMajorConst eigen_rhs(rhs_data, rhs_params.rows, rhs_params.cols); + EigenMatrixMapColMajorMutable eigen_dst(dst_data, dst_params.rows, dst_params.cols); + + if (rhs_params.cols == 1) + { + eigen_dst.col(0).noalias() = eigen_lhs * eigen_rhs.col(0); + } + else if (lhs_params.rows == 1) + { + eigen_dst.row(0).noalias() = eigen_lhs.row(0) * eigen_rhs; + } + else + { + eigen_dst.noalias() = eigen_lhs * eigen_rhs; + } + + if (params.bias) + { + BiasAndClamp(params.clamp_min, params.clamp_max, dst_params.rows, params.bias, + dst_params.rows * dst_params.cols, dst_data); + } + else + { + eigen_dst = eigen_dst.cwiseMin(params.clamp_max).cwiseMax(params.clamp_min); + } + } +}; + +} // namespace detail +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h index 92e1614d1..c931ac518 100644 --- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -62,30 +62,27 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> 
class TensorContractionInputMapper< - Scalar_, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper VectorMapper; typedef SubMapper LinearMapper; @@ -95,11 +92,11 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper( - const TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device> &tensor, - const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) - : m_impl(tensor.impl().impl()) + const TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device> + &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) { Index patch_rows; Index patch_depth; @@ -167,7 +164,7 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) - : m_impl(base_mapper.m_impl) + : m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; @@ -280,11 +277,10 @@ public: private: friend class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; // Load coefficient from a patch specified by the "within patch offset" // (patchId) and the precomputed indices of the first element of the patch. @@ -298,14 +294,14 @@ private: const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex + colOffset * m_in_col_strides; const Index origInputCol = (m_patch_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + ? 
inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = (m_patch_row_inflate_strides == 1) - ? inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || (inputRow != origInputRow * m_patch_row_inflate_strides)) @@ -314,7 +310,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -338,7 +334,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -390,7 +386,7 @@ private: // span[0] all the way upto (and including) span[1]. const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template partialPacket<Packet>(inputIndex - span[0], mask<Packet>(span[0], span[1] + 1)); } @@ -445,10 +441,10 @@ private: // Load partial packets and do bit-wise OR to generate required packet return internal::por<Packet>( - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], - patchOffsets2Cols[0], colOffsets[0]), - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], - patchOffsets2Cols[1], colOffsets[1])); + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], + patchOffsets2Cols[0], colOffsets[0]), + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], + patchOffsets2Cols[1], colOffsets[1])); } // Helper function to load a packet that is present in a single columns. @@ -477,7 +473,7 @@ private: // no padding const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); @@ -490,7 +486,7 @@ private: // load. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<Packet>::size; @@ -538,7 +534,7 @@ private: // packets. 
template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<PacketT>::size; @@ -604,7 +600,7 @@ private: // no padding const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } @@ -627,10 +623,10 @@ private: computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const { const size_t NumInputDims = - array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = - (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); otherIndex *= m_patchInputStride; colIndex = patch2DIndex / m_fastOutputRows; rowIndex = patch2DIndex - colIndex * m_outputRows; @@ -689,31 +685,28 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::half HalfPacket; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - ParentMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef Self LinearMapper; @@ -722,16 +715,16 @@ public: EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset + base_mapper.m_depth_offset), - m_col_offset(horiz_offset + base_mapper.m_col_offset), - m_base_mapper(base_mapper.m_base_mapper) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + m_base_mapper(base_mapper.m_base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } @@ -766,7 +759,7 @@ public: { typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>( - i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); } template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } @@ -781,7 +774,7 @@ public: EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { const Index max_col = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); return std::min<Index>(1 + max_col, patchCols()); } @@ -789,8 +782,8 @@ public: EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const { const Index max_row = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / - fastPatchRowStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); return std::min<Index>(1 + max_row, patchRows()); } @@ -862,7 +855,7 @@ public: } template <typename PacketT = Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; @@ -913,8 +906,8 @@ public: const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) - ? input_row - : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + ? input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); @@ -932,8 +925,8 @@ public: const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) - ? input_col - : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + ? input_col + : ((input_col >= 0) ? 
(input_col / m_base_mapper.m_fastInputColStride) : 0); return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); @@ -1033,23 +1026,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, - Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1159,7 +1149,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1248,22 +1238,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1378,7 +1366,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1472,22 +1460,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1582,27 +1568,25 @@ struct gemm_pack_rhs< */ template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits<Input>::Layout == ColMajor, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const OutputKernel>>, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const OutputKernel>>>::type + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const 
TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const OutputKernel>>>::type SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, const Index row_in_stride = 1, const Index col_in_stride = 1, @@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str typedef typename internal::traits<Input>::Index TensorIndex; TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex>> - in(input); + in(input); TensorRef< - Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, - internal::traits<Kernel>::Layout, TensorIndex>> - kern(kernel); + Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, + internal::traits<Kernel>::Layout, TensorIndex>> + kern(kernel); EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str } if (padding_explicit) { - return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, - padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches( - kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, + col_stride, row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, + padding_bottom, padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, padding_bottom, + padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } else { return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - 
.extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, - col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } } diff --git a/compute/cker/include/cker/operation/AddN.h b/compute/cker/include/cker/operation/AddN.h new file mode 100644 index 000000000..1704da641 --- /dev/null +++ b/compute/cker/include/cker/operation/AddN.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ADDN_H__ +#define __NNFW_CKER_ADDN_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +void AddN(const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data) +{ + const size_t size = input_shape.FlatSize(); + for (size_t i = 0; i < size; ++i) + { + T x = 0; + for (size_t j = 0; j < num_inputs; ++j) + { + x += input_data[j][i]; + } + output_data[i] = x; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ADDN_H__ diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index 6149cafa7..e10f02ad4 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -73,10 +73,10 @@ void AveragePool<float>(const PoolParams ¶ms, const Shape &input_shape, cons int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 
0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, } } +template <> +void AveragePool<int8_t>(const PoolParams ¶ms, const Shape &input_shape, + const int8_t *input_data, const Shape &output_shape, int8_t *output_data) +{ + // Here, and in other pooling ops, in order to maintain locality of reference, + // to minimize some recalculations, and to load into NEON vector registers, we + // use an inner loop down the depth. Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + int32_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. 
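    // For example, with depth == 600 and kPoolingAccTrancheSize == 256 the
    // loop below runs three times with tranche_depth equal to 256, 256 and 88,
    // so the fixed-size int32 accumulator array above never has to grow with
    // the tensor depth.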
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const int8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const int8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + int16x4_t acc_reg[4]; + int8x16_t input_reg = vld1q_s8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg))); + acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg))); + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + int16x4_t acc_reg[2]; + int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr)); + input_channel_ptr += 8; + acc_reg[0] = vget_low_s16(input_reg); + acc_reg[1] = vget_high_s16(input_reg); + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } + int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 8; channel += 8) + { + int16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count + : (acc[channel + i] - filter_count / 2) / filter_count; + } + int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); + buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); + buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); + vst1_s8(output_ptr + channel, buf8); + } +#endif + for (; channel < tranche_depth; ++channel) + { + int16_t a = acc[channel] > 0 ? 
(acc[channel] + filter_count / 2) / filter_count + : (acc[channel] - filter_count / 2) / filter_count; + a = std::max<int16_t>(a, params.quantized_activation_min); + a = std::min<int16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<int8_t>(a); + } + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h index e33b2fba5..980ad48dd 100644 --- a/compute/cker/include/cker/operation/BatchToSpaceND.h +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -43,7 +43,7 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that // end_index is exclusive). *end_index = - std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); } template <typename T> @@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1 for (int in_w = in_w_start; in_w < in_w_end; ++in_w) { const int out_w = - in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; assert(out_w >= 0); assert(out_w < output_width); T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index 8aef1f8c1..c7878496a 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, // From this point it is assumed contractually that corresponding dimensions // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. const bool swap_inputs = - params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; const Shape *shape_b = swap_inputs ? 
&extended_shape0 : &extended_shape1; @@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, } template <BinaryArithmeticOpType op_type, typename T> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data, - output_shape, output_data); + optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: throw std::runtime_error{"Quant8 Asymm NYI"}; - default: assert(false); break; @@ -246,9 +246,8 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: - reference::BinaryArithmeticOp<float>(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; default: assert(false); @@ -257,33 +256,32 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap } template <BinaryArithmeticOpType op_type, typename T> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, - T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void 
BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: @@ -312,11 +310,17 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; case nnfw::cker::BinaryArithmeticOpType::DIV: + optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; case nnfw::cker::BinaryArithmeticOpType::POW: reference::BroadcastBinaryArithmeticOpSlow<float>( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, float>()); break; default: assert(false); diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h index 5068eca96..145deda29 100644 --- a/compute/cker/include/cker/operation/BroadcastTo.h +++ b/compute/cker/include/cker/operation/BroadcastTo.h @@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo } } }; -} // functor +} // namespace functor template <typename T> inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape, diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h index d69b38aca..24d4cc4c7 100644 --- a/compute/cker/include/cker/operation/Common.h +++ b/compute/cker/include/cker/operation/Common.h @@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (; i < bias_size; i++) { array_ptr[i] = - ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); } } #else // not NEON @@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (int i = 0; i < bias_size; i++) { array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + 
i] + bias_data[i], clamp_min, clamp_max); + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); } } #endif diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h index 47eb6034c..ac6af8487 100644 --- a/compute/cker/include/cker/operation/Comparison.h +++ b/compute/cker/include/cker/operation/Comparison.h @@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data, const Shape &output_shape, bool *output_data) { const int64_t flatsize = // number of data.... - MatchingFlatSize(input1_shape, input2_shape, output_shape); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); @@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams ¶ms, const Shape &input1_ const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } @@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } @@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, for (int c = 0; c < output_shape.Dims(3); ++c) { const int32_t input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32_t input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, } } -#define TFLITE_COMPARISON_OP(name) \ - template <typename T> \ - inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, bool *output_data) \ - { \ - Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ - output_data); \ - } \ - template <typename T> \ - inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, 
const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ - const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, \ - bool *output_data) \ - { \ - ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ - const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ +#define TFLITE_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> 
\ + inline void Broadcast4DSlow##name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ } TFLITE_COMPARISON_OP(Equal); diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h index 394123e30..9aaca00b7 100644 --- a/compute/cker/include/cker/operation/Concatenation.h +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams ¶ms, for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); } } diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h index 214f2e612..2572b51ee 100644 --- a/compute/cker/include/cker/operation/Conv.h +++ b/compute/cker/include/cker/operation/Conv.h @@ -57,9 +57,9 @@ class Conv public: Conv() : _modified_filter_data(), _im2col_shape(4), _need_im2col(false), _prepared(false) {} - void prepare(const Shape &filter_shape, const float *filter_data, PaddingType padding_type, - bool &is_replaced_weights, uint32_t dilationWidthFactor, - uint32_t dilationHeightFactor) + void prepareF32(const Shape &filter_shape, const float *filter_data, PaddingType padding_type, + bool &is_replaced_weights, uint32_t dilationWidthFactor, + uint32_t dilationHeightFactor) { if (!_prepared) { @@ -71,12 +71,14 @@ public: } } - void prepareQuant(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape, - uint32_t stride_width, uint32_t stride_height) + void prepareQ8uPerTensor(const Shape &input_shape, const Shape &kernel_shape, + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) { if (!_prepared) { - IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height); + IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor); _prepared = true; } } @@ -115,7 +117,8 @@ public: { // This means that input or output are dynamic or filter is not constant IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width, - params.stride_height); + params.stride_height, params.dilation_width_factor, + params.dilation_height_factor); } int im2col_size = _need_im2col ? 
_im2col_shape.FlatSize() : 1; @@ -135,6 +138,29 @@ public: } } + void operator()(const ConvParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, + const int32_t *filter_zero_point, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) + { + reference::Conv<uint8_t, true>(params, _per_channel_output_multiplier.data(), + _per_channel_output_shift.data(), input_shape, input_data, + filter_shape, filter_data, filter_zero_point, bias_shape, + bias_data, output_shape, output_data); + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) + { + reference::Conv<int8_t, false>(params, _per_channel_output_multiplier.data(), + _per_channel_output_shift.data(), input_shape, input_data, + filter_shape, filter_data, nullptr /* filter_zero_point */, + bias_shape, bias_data, output_shape, output_data); + } + std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; } + std::vector<int> &per_channel_output_shift() { return _per_channel_output_shift; } + private: bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor, int32_t dilation_height_factor) @@ -154,10 +180,15 @@ private: } void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape, - const Shape &output_shape, uint32_t stride_width, uint32_t stride_height) + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) { - _need_im2col = stride_width != 1 || stride_height != 1 || kernel_shape.Dims(1) != 1 || - kernel_shape.Dims(2) != 1; + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 || + kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1; + + _need_im2col = need_dilated_im2col || need_non_dilated_im2col; + if (_need_im2col) { _im2col_shape.SetDim(0, output_shape.Dims(0)); @@ -172,7 +203,25 @@ private: Shape _im2col_shape; bool _need_im2col; bool _prepared; + // Per channel output multiplier and shift. + std::vector<int32_t> _per_channel_output_multiplier; + std::vector<int> _per_channel_output_shift; +}; + +struct ConvHybridTempArena +{ + ConvHybridTempArena(int batch_size, int input_size) + { + input_quantized.resize(input_size); + // TODO: Optimize the case of batch_size = 1 + input_scaling_factors.resize(batch_size); + input_offsets.resize(batch_size); + } + std::vector<int8_t> input_quantized; + std::vector<float> input_scaling_factors; + std::vector<int32_t> input_offsets; }; + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h new file mode 100644 index 000000000..e57fef01d --- /dev/null +++ b/compute/cker/include/cker/operation/DepthToSpace.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__ +#define __NNFW_CKER_DEPTH_TO_SPACE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data, + const Shape &unextended_output_shape, T *output_data, int32_t block_size) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + + const int output_depth = output_shape.Dims(3); + const int batch_size = output_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. + const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int in_h = 0; in_h < input_height; ++in_h) + { + const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0); + for (int offset_h = 0; offset_h < block_size; ++offset_h) + { + const T *src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) + { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 814a9e019..c926ec4f1 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -22,143 +22,162 @@ #include "cker/Types.h" #include "cker/Utils.h" #include "cker/neon/neon_check.h" +#include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h" +#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h" +#include "cker/operation/reference/integer_ops/DepthwiseConvHybrid.h" +#include "cker/CpuBackendThreadpool.h" namespace nnfw { namespace cker { -inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &filter_shape, - const uint8_t *filter_data, const Shape &bias_shape, - const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +// TODO(luwa): add multithread to per-channel depthwise_conv +// DepthwiseConv can run with multi threads on the dim specified by thread_dim. +// Each thread processes output elements on dim, thread_dim, in the range of +// [thread_start, thread_end). +// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it +// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :]. 
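[Editor's aside, not part of the patch] The comment above describes how the new multithreaded path splits one output dimension into contiguous [thread_start, thread_end) ranges, one per worker task. Below is a minimal standalone sketch of the partitioning arithmetic used further down in DepthwiseConv() (thread_end = thread_start + remaining size / remaining tasks); the sizes and the tiny program are illustrative only, not code from this patch.

    #include <cstdio>

    int main()
    {
      // Split a dimension of size 10 (e.g. output_height) across 3 worker tasks.
      const int dim_size = 10;
      const int thread_count = 3;
      int thread_start = 0;
      for (int i = 0; i < thread_count; ++i)
      {
        const int thread_end = thread_start + (dim_size - thread_start) / (thread_count - i);
        std::printf("task %d handles indices [%d, %d)\n", i, thread_start, thread_end);
        thread_start = thread_end;
      }
      // Prints [0, 3), [3, 6), [6, 10): every index is covered exactly once,
      // and the remainder goes to the last task.
      return 0;
    }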
+template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { - const int depth_multiplier = params.depth_multiplier; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - assert(dilation_width_factor >= 1); - assert(dilation_height_factor >= 1); - UNUSED_RELEASE(dilation_width_factor); - UNUSED_RELEASE(dilation_height_factor); - assert(input_shape.DimensionsCount() == 4); - assert(filter_shape.DimensionsCount() == 4); - assert(output_shape.DimensionsCount() == 4); - assert(output_activation_min <= output_activation_max); - UNUSED_RELEASE(output_activation_min); - UNUSED_RELEASE(output_activation_max); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_depth = input_shape.Dims(3); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(input_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(depth_multiplier); - -// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on -// Jetson TX-2. This compiler does not support the offsetof() macro. -#if defined(__aarch64__) -// TODO Use below codes - -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int output_shift = params.output_shift; -// -// // Call kernel optimized for depthwise convolutions using 3x3 filters if -// // parameters are supported. 
-// if (Fast3x3FilterKernelSupported( -// input_shape, filter_shape, stride_width, stride_height, -// dilation_width_factor, dilation_height_factor, pad_width, pad_height, -// depth_multiplier, output_shape, output_shift)) { -// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, -// filter_data, bias_shape, bias_data, output_shape, -// output_data); -// return; -// } -#endif - - optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data); + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), input_shape_(input_shape), input_data_(input_data), + filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape), + bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data), + thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_, + bias_shape_, bias_data_, output_shape_, output_data_, + thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + // const CpuFlags& cpu_flags_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape) +{ + // How many scalar multiplications are needed to make it worth using one + // more thread + static constexpr int kMinMulPerThread = 1 << 13; // 8k + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_muls = output_shape.FlatSize() * filter_height * filter_width; + // Try to avoid real runtime divisions if possible by dividing by a + // compile-time constant. + int thread_count = std::max(1, num_muls / kMinMulPerThread); + return thread_count; +} + +inline bool MultithreadAlongBatches(int thread_count, int batches) +{ + assert(thread_count >= 2); + // If there are fewer batch entries than the number of threads we want to use, + // then better do intra-batch-entry multithreading. + if (batches < thread_count) + { + return false; + } + // If there are at least 2 batch entries to be handed to each thread, then + // it's safe to proceed with batch-wise multithreading: each thread will have + // approximately equal number of batch entries to handle, so the load + // balancing will be reasonable, and the amount to which the load is not + // perfectly balanced will be offset by the inherent advantages of + // batch-wise multithreading (each thread is more efficient thanks to working + // on larger buffers with less boundary-handling overhead). + if (batches >= 2 * thread_count) + { + return true; + } + // In the limit case were there are at least 1 but not much more than 1 + // batch entries per thread, it may be a good idea to do per-batch + // multithreading if the number of batch entries is a multiple of the number + // of threads, so that each thread will have the same number of batch entries + // to process. 
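  // [Editor's aside, not part of the patch] Worked example of the three rules in
  // MultithreadAlongBatches(), assuming thread_count = 4: batches = 3 returns
  // false above (3 < 4, so the other dimension is split instead); batches = 8
  // returns true above (8 >= 2 * 4, batch-wise threading); batches = 6 falls
  // through to the check below and returns false (6 % 4 != 0), while
  // batches = 4 returns true (4 % 4 == 0).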
+ return ((batches % thread_count) == 0); } +template <typename T, typename TS> inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &filter_shape, - const float *filter_data, const Shape &bias_shape, const float *bias_data, - const Shape &output_shape, float *output_data) + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, ruy::Context *ruy_context) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; assert(input_shape.DimensionsCount() == 4); assert(filter_shape.DimensionsCount() == 4); assert(output_shape.DimensionsCount() == 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); + int thread_count = HowManyConvThreads(output_shape, filter_shape); + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads(); + + thread_count = std::max(1, std::min(thread_count, max_threads)); + // Cap the number of threads to 2 for float path to avoid regression in + // performance (b/132294857). + if (std::is_floating_point<T>::value) + { + thread_count = std::min(thread_count, 2); + } + + const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(bias_shape); - for (int b = 0; b < batches; ++b) + if (thread_count == 1) + { + optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, 0, output_height, + 1); + return; + } + + int thread_dim, thread_dim_size; + if (MultithreadAlongBatches(thread_count, output_batches)) + { + thread_dim = 0; + thread_dim_size = output_batches; + } + else + { + thread_dim = 1; + thread_dim_size = output_height; + } + + std::vector<DepthwiseConvWorkerTask<T, TS>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int ic = 0; ic < input_depth; ++ic) - { - for (int m = 0; m < depth_multiplier; m++) - { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) - { - float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; - total += (input_value * filter_value); - } - } - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[oc]; - } - output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } - } - } + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); } } // namespace cker diff --git a/compute/cker/include/cker/operation/Dequantize.h b/compute/cker/include/cker/operation/Dequantize.h new file mode 100644 index 000000000..c8c2fd9d4 --- /dev/null +++ b/compute/cker/include/cker/operation/Dequantize.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_DEQUANTIZE_H__ +#define __NNFW_CKER_DEQUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ + +#ifdef USE_NEON +namespace +{ +inline void ScaleWithNewZeroPoint(const int32x4_t input, const float32x4_t scale_dup, + const float32x4_t zero_times_scale_dup, float32x4_t *output) +{ +#ifdef __ARM_FEATURE_FMA + *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup); +#else + *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup), zero_times_scale_dup); +#endif +} +} // namespace +#endif // USE_NEON + +inline void Dequantize(const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const uint8x8_t input_u8 = vld1_u8(input_data + i); + const uint16x8_t input_u16 = vmovl_u8(input_u8); + const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +inline void Dequantize(const Shape &input_shape, const int8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const int8x8_t input_s8 = vld1_s8(input_data + i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +inline void Dequantize(const Shape &input_shape, const int16_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = 
MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const int16x4_t input_s16_low = vld1_s16(input_data + i); + const int16x4_t input_s16_high = vld1_s16(input_data + i + 4); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_DEQUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h new file mode 100644 index 000000000..6bdd7c62e --- /dev/null +++ b/compute/cker/include/cker/operation/ELU.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ELU_H__ +#define __NNFW_CKER_ELU_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + const float val = input_data[i]; + output_data[i] = val < 0.0 ? std::exp(val) - 1 : val; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELU_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 3d1837f47..bb9f88f8d 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape) { return Shape::ExtendedShape(shape.DimensionsCount(), shape); } -} +} // namespace class Einsum { @@ -274,7 +274,7 @@ public: } for (int i = 0; i < num_inputs; ++i) { - for (int label : free_labels[i]) + for (auto &&label : free_labels[i]) { result_labels.push_back(label); result_shape_dims.push_back(label_to_dim_sizes[label]); @@ -300,7 +300,7 @@ public: { // We inflated the output. Modify result labels accordingly. 
Labels inflated_labels; - for (int label : result_labels) + for (auto &&label : result_labels) { inflated_labels.insert(inflated_labels.end(), output_label_counts[label], label); } @@ -394,8 +394,8 @@ private: for (int label = 0; label < num_labels; ++label) { bool removed = (_output_label_counts[label] == 0); - bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || - _input_label_counts[1][label] == 0; + bool unique = + num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0; _label_types[label] = getDimensionType(removed, unique); } } @@ -483,8 +483,8 @@ private: if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) { throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + - std::to_string(labels->size() - 1) + " but got: " + - std::to_string(inputs[i].shape.DimensionsCount())}; + std::to_string(labels->size() - 1) + + " but got: " + std::to_string(inputs[i].shape.DimensionsCount())}; } int ellipsis_axis = -1; const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; @@ -511,7 +511,7 @@ private: } std::vector<bool>::iterator it_input = - std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) { return; @@ -645,11 +645,11 @@ private: // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. const int32_t output_size = - reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce( - device, output->shaped<T, 1>({output_size}), - input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), - Reducer()); + device, output->shaped<T, 1>({output_size}), + input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), + Reducer()); } bool shouldSwapFreeAndContract(const Labels &labels, @@ -775,11 +775,11 @@ private: Shape inflated_shape; std::vector<int32_t> strided_shape_dims; std::vector<int32_t> inflated_shape_dims; - for (int label : labels) + for (auto &&label : labels) { const int32_t count = label_counts[label]; const int current_axis = - should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); + should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); const int32_t dim = input.shape.Dims(current_axis); strided_shape_dims.push_back(dim); inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); @@ -879,7 +879,7 @@ private: for (size_t i = 0; i < inputs.size(); ++i) { const int32_t free_axis = - inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 
1 : 2); output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); } bool adj_x = swap_free_and_contract[0]; diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h index 598a032bb..0e980f18e 100644 --- a/compute/cker/include/cker/operation/Elementwise.h +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -66,8 +66,9 @@ inline void Rsqrt(const Shape &input_shape, const float *input_data, const Shape } } -inline void Neg(const Shape &input_shape, const float *input_data, const Shape &output_shape, - float *output_data) +template <typename T> +inline void Neg(const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) { const int size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < size; i++) @@ -86,6 +87,39 @@ inline void Log(const Shape &input_shape, const float *input_data, const Shape & } } +inline void Floor(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::floor(input_data[i]); + } +} + +inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::sqrt(input_data[i]); + } +} + +inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = input_data[i] * input_data[i]; + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index 14daf9839..f88c3a5fb 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -25,26 +25,12 @@ namespace nnfw namespace cker { template <typename T> -inline void Fill(const Shape &input_shape, int *input_data, const T value_data, - const Shape &output_shape, T output_data) +inline void Fill(const T *value_data, const Shape &output_shape, T *output_data) { - int input_size = input_shape.FlatSize(); - int output_size = 1; - for (int i = 0; i < input_size; i++) + int output_size = output_shape.FlatSize(); + for (int i = 0; i < output_size; i++) { - output_size *= input_data[i]; - } - - if (output_size == output_shape.FlatSize()) - { - for (int i = 0; i < output_size; i++) - { - output_data[i] = *value_data; - } - } - else - { - throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output"); + output_data[i] = *value_data; } } diff --git a/compute/cker/include/cker/operation/FloorDiv.h b/compute/cker/include/cker/operation/FloorDiv.h new file mode 100644 index 000000000..cdb2c2a8b --- /dev/null +++ b/compute/cker/include/cker/operation/FloorDiv.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_FLOOR_DIV_H__ +#define __NNFW_CKER_FLOOR_DIV_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void FloorDivBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = std::floor( + std::divides<double>()(static_cast<double>(in1_val), static_cast<double>(in2_val))); + } + } + } + } +} + +template <typename T> +inline void FloorDivElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = std::floor(std::divides<double>()(static_cast<double>(input1_data[t]), + static_cast<double>(input2_data[t]))); + } +} + +} // namespace cker + +} // namespace nnfw +#endif diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 4280c9ae2..71a2f19ef 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -19,10 +19,14 @@ #define __NNFW_CKER_FULLY_CONNECTED_H__ #include <ruy/context.h> +#include "cker/operation/FullyConnectedDense16x1.h" +#include "cker/operation/FullyConnectedSparse16x1.h" +#include "cker/operation/optimized/Gemm.h" #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" #include "cker/TensorUtils.h" +#include "cker/neon/neon_check.h" namespace nnfw { @@ -55,6 +59,42 @@ public: std::vector<int32_t> accum_scratch; }; +#if defined(CKER_X86_PLATFORM) + +// From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *optional_bias_data, const Shape &output_shape, + float *output_data) +{ + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); + MatrixParams<float> rhs_params; + rhs_params.order = 
Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable); + + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable); + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + GemmParams<float, float> gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data, + gemm_params); +} + +#else // CKER_X86_PLATFORM + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, @@ -86,6 +126,8 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu } } +#endif // CKER_X86_PLATFORM + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, @@ -114,7 +156,7 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); const int output_depth = - MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); + MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -208,12 +250,13 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape return; } -inline void FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &weights_shape, - const float *weights_data, const Shape &bias_shape, - const float *bias_data, const Shape &output_shape, - float *output_data, int w0_size, const uint16_t *w1_segments, - const uint16_t *w1_indices) +inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, + const Shape &input_shape, const float *input_data, + const Shape &weights_shape, const float *weights_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const uint16_t *w1_segments, + const uint16_t *w1_indices) { UNUSED_RELEASE(params); UNUSED_RELEASE(input_shape); @@ -225,7 +268,7 @@ inline void FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const const int weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); @@ -239,13 +282,13 @@ inline void 
FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const } for (int b = 0; b < batches; ++b) { - for (int idx_0 = 0; idx_0 < w0_size; ++idx_0) + for (int idx_0 = 0; idx_0 < output_depth; ++idx_0) { for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) { int idx_1 = w1_indices[pw1]; output_data[b * output_depth + idx_0] += - weights_data[pw1] * input_data[b * accum_depth + idx_1]; + weights_data[pw1] * input_data[b * accum_depth + idx_1]; } } } diff --git a/compute/cker/include/cker/operation/FullyConnectedDense16x1.h b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h new file mode 100644 index 000000000..a7e9efd7f --- /dev/null +++ b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Copyright (c) 2018 Mozilla + 2008-2011 Octasic Inc. + 2012-2017 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ +#define __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/TensorUtils.h" + +namespace nnfw +{ +namespace cker +{ +#if defined(__aarch64__) && defined(USE_NEON) +inline void FullyConnected16x1Float32(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *bias_data, const Shape &, float *output_data) +{ + int total_input_size = input_shape.FlatSize(); + int input_size = weights_shape.Dims(1); + const int batch_size = total_input_size / input_size; + const int num_units = weights_shape.Dims(0); + + float *out = output_data; + const float *weights = weights_data; + int rows = num_units; + int cols = input_size; + int col_stride = input_size; + const float *x = input_data; + + // Output = bias if bias tensor exists. + if (bias_data) + { + VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); + } + else + { + ZeroVector(output_data, batch_size * num_units); + } + + // rows : out, cols : in + int i, j; + for (i = 0; i < rows; i += 16) + { + const float *w = &weights[i * col_stride]; + + /* keep y[0..15] in registers for duration of inner loop */ + float *__restrict y = &out[i]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + + for (j = 0; j < cols; j++) + { + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + float32x4_t xj; + + xj = vld1q_dup_f32(&x[j]); + + wvec0_3 = vld1q_f32(&w[0]); + y0_3 = vmlaq_f32(y0_3, wvec0_3, xj); + wvec4_7 = vld1q_f32(&w[4]); + y4_7 = vmlaq_f32(y4_7, wvec4_7, xj); + wvec8_11 = vld1q_f32(&w[8]); + y8_11 = vmlaq_f32(y8_11, wvec8_11, xj); + wvec12_15 = vld1q_f32(&w[12]); + y12_15 = vmlaq_f32(y12_15, wvec12_15, xj); + + w += 16; + } + + /* save y[0..15] back to memory */ + + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } +} +#endif +} // namespace cker +} // namespace nnfw +#endif // __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h new file mode 100644 index 000000000..df397f73e --- /dev/null +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Copyright (c) 2018 Mozilla + 2008-2011 Octasic Inc. 
+ 2012-2017 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ +#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/TensorUtils.h" + +namespace nnfw +{ +namespace cker +{ +inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams ¶ms, + const Shape &input_shape, const float *input_data, + const Shape &weights_shape, const float *weights_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const uint16_t *w1_segments, const uint16_t *w1_indices) +{ + UNUSED_RELEASE(input_shape); + + assert(weights_shape.DimensionsCount() == 2); + assert(output_shape.DimensionsCount() == 2); + + const int output_dims_count = output_shape.DimensionsCount(); + const int weights_dims_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); + const int output_depth = + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + const int accum_depth = weights_shape.Dims(weights_dims_count - 1); + + UNUSED_RELEASE(bias_shape); + if (bias_data) + { + VectorBatchVectorAssign(bias_data, output_depth, batches, output_data); + } + else + { + ZeroVector(output_data, batches * output_depth); + } + for (int b = 0; b < batches; ++b) + { + int depth_size = output_depth / 16; + for (int idx_0 = 0; idx_0 < depth_size; ++idx_0) +#ifdef USE_NEON + { + float *__restrict y; + y = &output_data[b * output_depth + idx_0 * 16]; + /* keep y[0..15] in registers for duration of inner loop */ + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) + { + auto idx_1 = w1_indices[pw1]; + float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]); + float32x4_t wvec; + + wvec = vld1q_f32(&weights_data[0]); + y0_3 = vmlaq_f32(y0_3, wvec, xj); + wvec = vld1q_f32(&weights_data[4]); + y4_7 = vmlaq_f32(y4_7, wvec, xj); + wvec = vld1q_f32(&weights_data[8]); + y8_11 = vmlaq_f32(y8_11, wvec, xj); + wvec = vld1q_f32(&weights_data[12]); + 
y12_15 = vmlaq_f32(y12_15, wvec, xj); + + weights_data += 16; + } + /* save y[0..15] back to memory */ + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } +#else + { + for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) + { + float *__restrict y; + float xj; + auto idx_1 = w1_indices[pw1]; + xj = input_data[b * accum_depth + idx_1]; + y = &output_data[b * output_depth + idx_0 * 16]; + y[0] += weights_data[0] * xj; + y[1] += weights_data[1] * xj; + y[2] += weights_data[2] * xj; + y[3] += weights_data[3] * xj; + y[4] += weights_data[4] * xj; + y[5] += weights_data[5] * xj; + y[6] += weights_data[6] * xj; + y[7] += weights_data[7] * xj; + y[8] += weights_data[8] * xj; + y[9] += weights_data[9] * xj; + y[10] += weights_data[10] * xj; + y[11] += weights_data[11] * xj; + y[12] += weights_data[12] * xj; + y[13] += weights_data[13] * xj; + y[14] += weights_data[14] * xj; + y[15] += weights_data[15] * xj; + weights_data += 16; + } + } +#endif + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data); + } +} +} // namespace cker +} // namespace nnfw +#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h index d17a5796b..8a97d8421 100644 --- a/compute/cker/include/cker/operation/FusedBatchNorm.h +++ b/compute/cker/include/cker/operation/FusedBatchNorm.h @@ -105,7 +105,7 @@ public: float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size)); // This adjustment is for Bessel's correction float rest_size_adjust = - static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); + static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth); @@ -117,12 +117,12 @@ public: batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); auto x_scaled = x_centered * scaling_factor; auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); UNUSED_RELEASE(rest_size_adjust); diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h index a0abf2935..211db98ce 100644 --- a/compute/cker/include/cker/operation/Helper/BCast.h +++ b/compute/cker/include/cker/operation/Helper/BCast.h @@ -22,7 +22,7 @@ * ToDo : This file will be moved into upper folder when integrate with other * custom operations. * And It should merged with EinsumHelper's BCast. 
-**/ + **/ #include "cker/Shape.h" #include "cker/eigen/EigenSupport.h" @@ -393,7 +393,7 @@ public: BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, const bool return_flattened_batch_indices = false) - : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) { } diff --git a/compute/cker/include/cker/operation/Helper/MatmulBCast.h b/compute/cker/include/cker/operation/Helper/MatmulBCast.h index b80ccc0d0..b7d639433 100644 --- a/compute/cker/include/cker/operation/Helper/MatmulBCast.h +++ b/compute/cker/include/cker/operation/Helper/MatmulBCast.h @@ -62,13 +62,13 @@ public: if (!_batch_bcast->IsValid()) return; - auto x_reshaped = _batch_bcast->x_reshape(); - auto y_reshaped = _batch_bcast->y_reshape(); + const auto &x_reshaped = _batch_bcast->x_reshape(); + const auto &y_reshaped = _batch_bcast->y_reshape(); auto output_shape = _batch_bcast->output_shape(); _x_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1), std::multiplies<int32_t>()); - _y_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1), + _y_batch_size = std::accumulate(y_reshaped.cbegin(), y_reshaped.cend(), INT32_C(1), std::multiplies<int32_t>()); _output_shape.ReplaceWith(output_shape.size(), output_shape.data()); _output_batch_size = _output_shape.FlatSize(); diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index baeafd7c9..f16e5019d 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -168,7 +168,7 @@ public: // Must have lo < hi UniformDistribution(int32_t lo, int32_t hi) - : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) + : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) { } @@ -207,7 +207,7 @@ public: // Must have lo < hi UniformDistribution(int64_t lo, int64_t hi) - : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) { } @@ -291,22 +291,22 @@ public: template <typename Generator> class UniformFullIntDistribution<Generator, int32_t> - : public UniformFullIntDistribution32<Generator, int32_t> + : public UniformFullIntDistribution32<Generator, int32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint32_t> - : public UniformFullIntDistribution32<Generator, uint32_t> + : public UniformFullIntDistribution32<Generator, uint32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, int64_t> - : public UniformFullIntDistribution64<Generator, int64_t> + : public UniformFullIntDistribution64<Generator, int64_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint64_t> - : public UniformFullIntDistribution64<Generator, uint64_t> + : public UniformFullIntDistribution64<Generator, uint64_t> { }; @@ -324,7 +324,7 @@ public: PHILOX_DEVICE_INLINE explicit SingleSampleAdapter(Generator *gen) - : generator_(gen), used_result_index_(Generator::kResultElementCount) + : generator_(gen), used_result_index_(Generator::kResultElementCount) { } @@ -615,8 +615,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> public: // The number of elements that will be returned. 
static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) - ? SingleSampleGenerator::kNativeElementCount / 2 - : 1; + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; // Cost of generation of a single element (in cycles). static constexpr int kElementCost = 90; // Indicate that this distribution may take variable number of samples @@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1) } } // namespace random -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h index 7dc51fe94..6b7049ddf 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOp.h +++ b/compute/cker/include/cker/operation/Helper/RandomOp.h @@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution> }; } // namespace functor -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 85d267723..c99f69709 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -109,7 +109,7 @@ template <class Distribution> struct FillPhiloxRandomTask<Distribution, true> { const int kGroupSize = Distribution::kResultElementCount; static const int kGeneratorSkipPerOutputGroup = - kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; int64_t offset = 0; @@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d } // namespace functor -} // end namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index e6ac008a5..ec29a15c3 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -29,58 +29,58 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> str { // Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Tensor; + Tensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstTensor; + ConstTensor; // Unaligned Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>> - UnalignedConstTensor; + UnalignedConstTensor; typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> - Tensor32Bit; + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. 
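// (Editorial note: these aliases are thin Eigen::TensorMap views over caller-owned buffers;
//  for example, TTypes<float, 2>::UnalignedConstMatrix m(ptr, rows, cols) wraps a raw
//  row-major buffer as a rows x cols matrix without copying, while the Aligned variants
//  additionally assume a suitably aligned pointer.)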
typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Scalar; + Scalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedScalar; + UnalignedScalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstFlat; + ConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstVec; + ConstVec; // Unaligned Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> - UnalignedConstFlat; + UnalignedConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstMatrix; + ConstMatrix; // Unaligned Rank-2 tensor (matrix) of scalar type T. 
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> - UnalignedConstMatrix; + UnalignedConstMatrix; }; typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32; diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h index 6445e8a2b..8fa8b03bc 100644 --- a/compute/cker/include/cker/operation/InstanceNorm.h +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_sh double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; double output_value = input_value * a + b; output_data[Offset(output_shape, batch, height, width, channel)] = - ActivationFunctionWithMinMax((float)output_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h index a0075c3d0..c1fca91cc 100644 --- a/compute/cker/include/cker/operation/L2Normalize.h +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uin { int32_t diff = *input_data - input_zero_point; int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); int32_t unclamped_output_val = 128 + rescaled_diff; int32_t output_val = std::min(static_cast<int32_t>(255), std::max(static_cast<int32_t>(0), unclamped_output_val)); diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h new file mode 100644 index 000000000..a8f1f8ca3 --- /dev/null +++ b/compute/cker/include/cker/operation/LSTM.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ +#define __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ + +#include "cker/TensorUtils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +// LINT.IfChange +// Calculates a single LSTM gate. +// +// Implements the following formula: (* is matrix multiply) +// gate = activate(W_input * input + W_aux * aux_input + +// W_peephole * cell + W_recurrent * prev_output + bias) +// with layer norm: +// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside +// +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +// +// Parameters: +// Input vectors (to LSTM): | Size: | Optional? 
+// input | n_input | +// aux_input | n_aux_input | y (bidir LSTM) +// Input vectors (persistent states): +// output_state | n_output | +// cell_state | n_cell | +// 'Constant' inputs: +// input_to_gate_weights | n_cell * n_input | +// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM) +// recurrent_to_gate_weights | n_cell * n_output | +// cell_to_gate_weights | n_cell | y (peephole) +// gate_bias | n_cell | +// layer_norm_coefficients | n_cell | y (layer norm) +// Output vector: +// gate | n_cell | +// Scalar parameters: +// n_batch - batch size / number of vectors +// n_input, n_aux_input, n_output, n_cell - size of vectors. +// activation - activation to use. +// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero. +// use_layer_norm - if doing layer norm LSTM. +inline void CalculateLstmGateFloat(const float *input, const float *input_to_gate_weights, + const float *aux_input, const float *aux_input_to_gate_weights, + const float *output_state, + const float *recurrent_to_gate_weights, const float *cell_state, + const float *cell_to_gate_weights, + const float *layer_norm_coefficients, const float *gate_bias, + const int n_batch, const int n_input, const int n_aux_input, + const int n_output, const int n_cell, + const FusedActivationFunctionType activation, float *gate, + const bool is_input_all_zeros, const bool is_aux_input_all_zeros) +{ + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with bias for regular lstm or initialize with + // zero for layer norm lstm. + if (use_layer_norm) + { + std::fill_n(gate, n_cell * n_batch, 0.0f); + } + else + { + VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate); + } + // For each batch and cell: compute input_weight * input. + // Skip if input is all zeros. + if (!is_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch, + gate, /*result_stride=*/1); + } + // For each batch and cell: compute aux_input_weight * aux_input. + // Skip if auxiliary input is not available or all zeros. + if (!is_aux_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input, + n_batch, gate, /*result_stride=*/1); + } + // For each batch and cell: compute recurrent_weight * output_state. + MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state, + n_batch, gate, /*result_stride=*/1); + // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM) + if (use_peephole) + { + VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch, + gate); + } + // Do layer normalization (if layer norm LSTM) + if (use_layer_norm) + { + MeanStddevNormalization(gate, gate, n_cell, n_batch); + VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate); + VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate); + } + // Apply activation + ApplyActivationToVector(gate, n_batch * n_cell, activation, gate); +} + +// Updates the LSTM cell state, used by both float and hybrid LSTM versions. +// +// Implements the following formula: +// cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate) +// +// With CIFG LSTM, input gate is replaced by (1-forget_gate). 
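// (Editorial summary, restating the formulas documented in this header; the peephole,
//  layer-norm, and projection terms are optional and skipped when their pointers are null.)
//
//   gate  = act( W_input*x + W_aux*aux + W_recurrent*h_prev + w_peephole .* c_prev + bias )
//   c_new = clip( forget_gate .* c_prev + input_gate .* cell_gate )   // input_gate = 1 - forget_gate with CIFG
//   h_new = clip( W_proj * (output_gate .* act(c_new)) + b_proj )     // identity when no projection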
+// +// Parameters: +// - n_batch, n_cell: sizes of vectors +// - cell_state: input/output vector, size n_batch*n_cell +// - input_gate: input vector, size n_batch*n_cell. +// - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG +// - cell_gate: input vector, size n_batch*n_cell. +// - use_cifg: use 1-forget_gate instead of input_gate. +// - clip: if > 0, clip the resulting cell state to [-clip, +clip]. +void UpdateLstmCellFloat(int n_batch, int n_cell, float *cell_state, const float *input_gate, + float *forget_gate, const float *cell_gate, bool use_cifg, float clip) +{ + // Define variable for 4th argument to avoid warning + // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2 + const float *cwise_product_rhs = cell_state; + VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state); + + if (use_cifg) + { + // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as + // scratch, as input_gate array is not allocated in this case. (Be careful + // not to write to the scratch before reading the forget gate data.) + float *scratch = forget_gate; + Sub1Vector(forget_gate, n_batch * n_cell, scratch); + VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state); + } + else + { + VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state); + } + if (clip > 0.0f) + { + CwiseClipping(cell_state, n_batch * n_cell, clip); + } +} + +// Calculates the output state tensor of an LSTM step. +// +// Implements the following formula: +// output_no_projection = output_gate .* activate(cell_state) +// (elementwise vector product) +// If no projection is used: +// output = output_state = output_no_projection +// With projection: +// output = output_state = clip(W*output_no_projection + bias) +// +// Output might not have a different 'stride' than n_batch, so we need to copy. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - projection_weights, projection_weights_scale, projection_bias: +// constant inputs, describing projection matrix and bias. +// - proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contigous. +// - scratch: scratch area, size n_batch*n_cell. 
+void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float *cell_state, + const float *output_gate, FusedActivationFunctionType activation, + const float *projection_weights, const float *projection_bias, + const float proj_clip, float *output_state, float *scratch) +{ + ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch); + + // Define variable for 4th argument to avoid warning + // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2 + const float *cwise_product_rhs = scratch; + VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch); + + const bool use_projection = (projection_weights != nullptr); + const bool use_projection_bias = (projection_bias != nullptr); + + if (use_projection) + { + if (use_projection_bias) + { + VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state); + } + else + { + std::fill_n(output_state, n_batch * n_output, 0.0f); + } + MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch, + output_state, /*result_stride=*/1); + if (proj_clip > 0.0f) + { + CwiseClipping(output_state, n_batch * n_output, proj_clip); + } + } + else + { + std::copy_n(scratch, n_batch * n_output, output_state); + } +} + +// Performs an LSTM batch inference step for input specified by input_ptr. +// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and +// biases (*_bias_ptr), and buffers (*_scratch), along with additional +// parameters: +// - params: various LSTM params including activation, clipping, etc., +// - n_batch: size of batch, +// - n_cell: number of cells (or units), +// - n_input: the input size, +// - n_aux_input: the auxiliary input size. +// - n_output: the output size. +// - output_batch_leading_dim: the leading dimension of the output buffer. +// +// Input of size 'n_batch * n_input': +// input_ptr +// Input of size 'n_batch * n_aux_input': +// aux_input_ptr - optional (can be nullptr) +// +// LSTM weights: +// Input weights of size 'n_cell * n_input': +// input_to_input_weights - optional +// input_to_forget_weights +// input_to_cell_weights +// input_to_output_weights +// Auxiliary input weights of size 'n_cell * n_aux_input': +// aux_input_to_input_weights - optional +// aux_input_to_forget_weights - optional +// aux_input_to_cell_weights - optional +// aux_input_to_output_weights - optional +// Recurrent weights of size 'n_cell * n_output': +// recurrent_to_input_weights - optional +// recurrent_to_forget_weights +// recurrent_to_cell_weights +// recurrent_to_input_weights +// Peephole weights of size 'n_cell', representing diagonal matrices. +// cell_to_input_weights - optional +// cell_to_cell_weights - optional +// cell_to_output_weights - optional +// Projection weights of size 'n_output * n_cell' +// projection_weights_ptr - optional +// Gate biases of size 'n_cell': +// input_gate_bias_ptr - optional +// forget_gate_bias_ptr +// cell_gate_bias_ptr +// output_gate_bias_ptr +// +// Layer norm coefficients of size 'n_cell', representing diagonal matrices. +// input_layer_norm_coefficients_ptr - optional +// forget_layer_norm_coefficients_ptr - optional +// cell_layer_norm_coefficients_ptr - optional +// output_layer_norm_coefficients_ptr - optional +// +// The pointers to the cell and output state and the output are updated. 
+// +// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned +// in batch_major order, and each step processes batch_size many inputs from +// input_ptr, and updates batch_size many cell and output states. +// +// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the +// output tensor, and in most cases will be equal to n_output. It is usually not +// when we want to store the LSTM output into a slice of the output tensor, e.g. +// for bidirectional LSTMs with merge_outputs. In this case, the batched +// operations cannot be used since they assume that the batched outputs are +// contiguous, and we manually loop over the batched outputs. +// LINT.IfChange +inline void LstmStepFloat( + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, + float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, + float *output_ptr) +{ + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + + // Make named scratch buffers. + float *input_gate_scratch = scratch0; + float *forget_gate_scratch = scratch1; + float *cell_gate_scratch = scratch2; + float *output_gate_scratch = scratch3; + + // Check if inputs are all zeros so we can skip some computations. + const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); + const bool is_aux_input_all_zeros = + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + if (!use_cifg) + { + // Calculate the input gate. (If not CIFG.) + CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr, + aux_input_to_input_weights_ptr, output_state_ptr, + recurrent_to_input_weights_ptr, cell_state_ptr, + cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr, + input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + } + // Calculate the forget gate. 
+ CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr, + aux_input_to_forget_weights_ptr, output_state_ptr, + recurrent_to_forget_weights_ptr, cell_state_ptr, + cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr, + forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Calculate the cell update gate. + CalculateLstmGateFloat( + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); + // Update the cell state. + UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, + cell_gate_scratch, use_cifg, params->cell_clip); + // Calculate output gate. + CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr, + aux_input_to_output_weights_ptr, output_state_ptr, + recurrent_to_output_weights_ptr, cell_state_ptr, + cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr, + output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Update the output state. + CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch, + params->activation, projection_weights_ptr, projection_bias_ptr, + params->proj_clip, output_state_ptr, scratch2); + // Copy output state to the output. Note that the output's rows may not be + // contiguous (output_batch_leading_dim != n_output). + for (int b = 0; b < n_batch; b++) + { + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h new file mode 100644 index 000000000..e12d01bba --- /dev/null +++ b/compute/cker/include/cker/operation/LeakyReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_LEKAY_RELU_H__ +#define __NNFW_CKER_LEKAY_RELU_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void LeakyReLU(const LeakyReluParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + const float val = input_data[i]; + // Note that alpha might be > 1 or < 0, so we don't use std::max here. + output_data[i] = val > 0 ? val : val * params.alpha; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h index 326a44f0c..eb7bdd900 100644 --- a/compute/cker/include/cker/operation/LogSoftMax.h +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, const Shape &input_shape, for (int c = 0; c < depth; ++c) { output_data[(i * depth + c) * inner_size + j] = - (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; + (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; } } } @@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, float input_scale, const Sha for (int c = 0; c < depth; ++c) { const float log_prob = - scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; + scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; const int32_t prob_quantized = std::rint(log_prob) + params.zero_point; output_data[(i * depth + c) * inner_size] = - static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); } } } diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h new file mode 100644 index 000000000..e877f5f47 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalAnd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
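A minimal usage sketch for the LeakyReLU kernel added above. It assumes cker::Shape has an initializer-list constructor and that LeakyReluParams (with an alpha member) is defined in cker/Types.h, as the header's includes suggest; buffer names, sizes, and values are illustrative.

#include "cker/operation/LeakyReLU.h"

void RunLeakyReluExample()
{
  nnfw::cker::LeakyReluParams params;
  params.alpha = 0.2f; // negative slope; may be > 1 or < 0

  const nnfw::cker::Shape shape{1, 2, 2, 2};
  const float input[8] = {-2.f, -1.f, 0.f, 1.f, 2.f, 3.f, -0.5f, 4.f};
  float output[8];

  nnfw::cker::LeakyReLU(params, shape, input, shape, output);
  // output: {-0.4, -0.2, 0, 1, 2, 3, -0.1, 4}
}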
+ */ + +#ifndef __NNFW_CKER_LOGICAL_AND_H__ +#define __NNFW_CKER_LOGICAL_AND_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val && in2_val; + } + } + } + } +} + +template <typename T> +inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = input1_data[t] && input2_data[t]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICAL_AND_H__ diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h index 3d3e59e55..e9907729e 100644 --- a/compute/cker/include/cker/operation/Logistic.h +++ b/compute/cker/include/cker/operation/Logistic.h @@ -29,12 +29,39 @@ namespace nnfw namespace cker { +/** + * @brief Internal scalar_logistic_op operation struct + * + * @note Recent Eigen3 scalar_logistic_op return invalid value on ARM32 if + * input value is float type 88 (expected: 1, actual: 0) + * As a workaround, we use old version scalar_logistic_op internal struct + * TODO Remove this workaround + */ +template <typename T> struct scalar_logistic_op +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T &x) const + { + const T one = T(1); + return one / (one + Eigen::numext::exp(-x)); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet &x) const + { + const Packet one = Eigen::internal::pset1<Packet>(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); - output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>()); + + // Use old version scalar_logistic_op + output_map.array() = input_map.array().unaryExpr(nnfw::cker::scalar_logistic_op<float>()); } } // namespace cker diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 5674ff3ef..ef2868455 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ 
b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap if (!(num_lower_diags <= row_num)) throw std::runtime_error( - "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); + "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); if (!(num_upper_diags <= col_num)) throw std::runtime_error( - "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); + "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init @@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap auto input = input_data + (batch * row_num * col_num + row * col_num); const T band_start = - num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); - const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num), - row + num_upper_diags + 1); + num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); + const T band_end = num_upper_diags < 0 + ? col_num + : std::min(static_cast<T>(col_num), row + num_upper_diags + 1); for (T band_idx = band_start; band_idx < band_end; band_idx++) { diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h index ea3fcaca6..5dc84d368 100644 --- a/compute/cker/include/cker/operation/MaxPool.h +++ b/compute/cker/include/cker/operation/MaxPool.h @@ -67,10 +67,10 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 
0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -79,8 +79,8 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl { int out_offset = NodeOffset(b, ph, pw, output_height, output_width); out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); + out_mat.col(out_offset) + .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); } } } @@ -139,8 +139,8 @@ void MaxPool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, const const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h index c0dbc6df5..ddc27b4c2 100644 --- a/compute/cker/include/cker/operation/OneHot.h +++ b/compute/cker/include/cker/operation/OneHot.h @@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax for (int k = 0; k < suffix_dim_size; ++k, ++output_data) { *output_data = - static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; } } } diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h index 5c82d111f..7292a199a 100644 --- a/compute/cker/include/cker/operation/Quantize.h +++ b/compute/cker/include/cker/operation/Quantize.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved.* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
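For orientation before the Quantize.h hunks below: the added specializations implement the usual affine mapping q = clamp(round(x / scale) + zero_point, qmin, qmax), with NEON handling eight values per iteration and a scalar tail loop for the remainder. A hedged scalar sketch of the same mapping (the example values in the comment are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t QuantizeOneToInt8(float x, float scale, int32_t zero_point)
{
  const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}
// Example: scale = 0.5, zero_point = 10, x = 3.2f  ->  round(6.4) + 10 = 16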
@@ -17,11 +18,14 @@ #ifndef __NNFW_CKER_QUANTIZE_H__ #define __NNFW_CKER_QUANTIZE_H__ +#include "cker/operation/Round.h" #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" -#include <stdexcept> +#include <cassert> #include <iostream> +#include <stdexcept> + namespace nnfw { namespace cker @@ -41,6 +45,409 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S output_data[i] = clamped; } } + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + int8_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<int8_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 = RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0); + const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1); + const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1); + const int8x8_t combined_val_narrowed = vmovn_s16(combined_val); + vst1_s8(output_data + i, combined_val_narrowed); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + uint8_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 
= RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0); + const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1); + const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1); + const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val); + vst1_u8(output_data + i, combined_val_narrowed); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + int16_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<int16_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<int16_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 = RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0); + const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1); + vst1_s16(output_data + i, narrowed_val_0); + vst1_s16(output_data + i + 4, narrowed_val_1); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size, + int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, + int32_t *scratch, int8_t *output) +{ + // Here we're trying to quantize the raw accumulators: + // output_channels + // data data data data data + // rows data data data data data + // data data data data data + // .... 
+ // + // In order to minimize the reload of the multipliers & shifts, once we load + // the multipliers & shifts, we load & quantize the raw accumulators for every + // row. +#ifdef USE_NEON + const int32x4_t output_offset_vec = vdupq_n_s32(output_zp); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max); + const int32x4_t zeros = vdupq_n_s32(0); +#endif + + assert(total_size % channel_size == 0); + const int32_t rows = total_size / channel_size; + + int c = 0; + +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + for (; c <= channel_size - 8; c += 8) + { + int32x4_t out_shift_1 = vld1q_s32(shift + c); + int32x4_t out_shift_2 = vld1q_s32(shift + c + 4); + int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros); + int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros); + + // Right shift will be performed as left shift with negative values. + int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros); + int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros); + + int32x4_t out_mul_1 = vld1q_s32(multiplier + c); + int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4); + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32x4_t acc_1 = vld1q_s32(scratch + loc); + int32x4_t acc_2 = vld1q_s32(scratch + loc + 4); + + // Saturating Rounding Doubling High Mul. + acc_1 = vshlq_s32(acc_1, left_shift_1); + acc_1 = vqrdmulhq_s32(acc_1, out_mul_1); + acc_2 = vshlq_s32(acc_2, left_shift_2); + acc_2 = vqrdmulhq_s32(acc_2, out_mul_2); + + // Rounding Dividing By POT. + acc_1 = vrshlq_s32(acc_1, right_shift_1); + acc_2 = vrshlq_s32(acc_2, right_shift_2); + + // Add the output offset. + acc_1 = vaddq_s32(acc_1, output_offset_vec); + acc_2 = vaddq_s32(acc_2, output_offset_vec); + + // Apply the activation function. + acc_1 = vmaxq_s32(acc_1, output_activation_min_vec); + acc_1 = vminq_s32(acc_1, output_activation_max_vec); + acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); + acc_2 = vminq_s32(acc_2, output_activation_max_vec); + + // Saturating cast to int8 and store to destination. + const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); + const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); + const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); + const int8x8_t res_s8 = vqmovn_s16(res_s16); + vst1_s8(output + loc, res_s8); + } + } + +#endif // USE_NEON + // Handle leftover values, one by one. This is very slow. + for (; c < channel_size; c++) + { + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32_t acc = scratch[loc]; + acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); + acc += output_zp; + acc = std::max(acc, output_min); + acc = std::min(acc, output_max); + output[loc] = static_cast<int8_t>(acc); + } + } +} + +template <typename input_type, typename output_type> +inline void Requantize(const input_type *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data) +{ + assert(!"Requantize: not supported type. 
It shouldn't reach here."); + UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint, + output_zeropoint, output_data); +} + +template <> +inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, int8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const uint8x16_t input_vec = vld1q_u8(input_data + i); + const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec)); + const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec)); + int32x4x4_t input; + input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half))); + input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half))); + input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half))); + input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half))); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]); + const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]); + const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]); + const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]); + const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2); + const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4); + const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half); + const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half); + const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half); + vst1q_s8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <> +inline void Requantize<int8_t, uint8_t>(const int8_t 
*input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, uint8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input_vec = vld1q_s8(input_data + i); + const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec)); + const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec)); + int32x4x4_t input; + input.val[0] = vmovl_s16(vget_low_s16(first_half)); + input.val[1] = vmovl_s16(vget_high_s16(first_half)); + input.val[2] = vmovl_s16(vget_low_s16(second_half)); + input.val[3] = vmovl_s16(vget_high_s16(second_half)); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]); + const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]); + const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]); + const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]); + + const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned); + const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned); + const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned); + const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned); + const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2); + const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4); + const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half); + const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half); + const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half); + vst1q_u8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + } // namespace cker } // 
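The Requantize specializations above work in fixed point: the caller is expected to fold input_scale / output_scale into the (effective_scale_multiplier, effective_scale_shift) pair consumed by MultiplyByQuantizedMultiplier. As a plain floating-point reference for what that mapping computes (a hedged sketch, not the fixed-point code path above):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t RequantizeOne(uint8_t in, float input_scale, int32_t input_zp,
                            float output_scale, int32_t output_zp)
{
  const float real_value = (static_cast<int32_t>(in) - input_zp) * input_scale;
  const int32_t q = static_cast<int32_t>(std::round(real_value / output_scale)) + output_zp;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}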
namespace nnfw diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h index 5c3a773a2..d6ccc68c8 100644 --- a/compute/cker/include/cker/operation/Range.h +++ b/compute/cker/include/cker/operation/Range.h @@ -35,8 +35,8 @@ template <typename T> inline int GetSize(T start, T limit, T delta) } int size = (std::is_integral<T>::value - ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) - : std::ceil(std::abs((limit - start) / delta))); + ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); return size; } diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index cf9634a67..02a9eac5e 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -21,6 +21,7 @@ #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" +#include "cker/neon/neon_check.h" namespace nnfw { @@ -30,6 +31,89 @@ namespace cker // A generic reduce method that can be used for reduce_sum, reduce_mean, etc. // This method iterates through input data and reduce elements along the // dimensions given in axis. + +#ifdef USE_NEON +inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape, + float *output_data) +{ + const auto input_dims = input_shape.DimsData(); + const auto input_num_dims = input_shape.DimensionsCount(); + + int input_size = 1; + int reduce_size = 0; + for (int idx = 0; idx < input_num_dims - 1; idx++) + { + input_size *= input_dims[idx]; + } + reduce_size = input_dims[input_num_dims - 1]; + int offset = 0; + for (int idx = 0; idx < input_size; idx++) + { + int r_idx = 0; + float tmp_data[4] = { + 0, + }; + float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); + for (; r_idx <= reduce_size - 32; r_idx += 32) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16); + float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20); + float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24); + float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28); + + float32x4_t x0 = vaddq_f32(a10, a20); + float32x4_t x1 = vaddq_f32(a11, a21); + float32x4_t x2 = vaddq_f32(a12, a22); + float32x4_t x3 = vaddq_f32(a13, a23); + + float32x4_t y0 = vaddq_f32(x0, x1); + float32x4_t y1 = vaddq_f32(x2, x3); + float32x4_t y2 = vaddq_f32(y0, y1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2); + } + for (; r_idx <= reduce_size - 16; r_idx += 16) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + + float32x4_t x0 = vaddq_f32(a10, a11); + float32x4_t x1 = vaddq_f32(a12, a13); + + float32x4_t y0 = vaddq_f32(x0, x1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0); + } + for (; r_idx <= reduce_size - 8; r_idx += 8) + { + float32x4_t a1 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t x = vaddq_f32(a1, a2); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x); + } + vst1q_f32(tmp_data, tmp_data_32x4); + output_data[idx] = tmp_data[0] + tmp_data[1] 
+ tmp_data[2] + tmp_data[3]; + + for (; r_idx < reduce_size; r_idx++) + { + if (r_idx == 0) + { + output_data[idx] = input_data[offset]; + } + else + { + output_data[idx] += input_data[offset + r_idx]; + } + } + offset += reduce_size; + } +} +#endif // NEON + template <typename In, typename Out> inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &, const int *axis, const int num_axis, int *input_iter, @@ -39,6 +123,32 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha const auto input_num_dims = input_shape.DimensionsCount(); // Reset input iterator. + if (num_axis == 1 && axis[0] == input_num_dims - 1) + { + int input_size = 1; + int reduce_size = 0; + for (int idx = 0; idx < input_num_dims - 1; idx++) + { + input_size *= input_dims[idx]; + } + reduce_size = input_dims[input_num_dims - 1]; + for (int idx = 0; idx < input_size; idx++) + { + for (int r_idx = 0; r_idx < reduce_size; r_idx++) + { + if (r_idx == 0) + { + output_data[idx] = input_data[idx * reduce_size]; + } + else + { + output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]); + } + } + } + return true; + } + for (int idx = 0; idx < input_num_dims; ++idx) { input_iter[idx] = 0; @@ -48,7 +158,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; @@ -202,12 +312,12 @@ public: } // Calculate mean by dividing output_data by num of aggregated element. - U num_elements_in_axis = 1; + size_t num_elements_in_axis = 1; for (int idx = 0; idx < num_resolved_axis; ++idx) { size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx])); // Overflow prevention. - if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis)) + if (current > static_cast<size_t>(std::numeric_limits<size_t>::max() / num_elements_in_axis)) { return false; } @@ -220,21 +330,21 @@ public: if (compute_sum) { // TODO(b/116341117): Eliminate float and do this completely in 8bit. 
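// Illustrative sketch, not part of the patch: the last-axis fast path added to
// ReduceImpl above is equivalent to this plain row-wise reduction over a
// flattened [outer, reduce] view. Names and shapes here are hypothetical.
template <typename T, typename Reducer>
void ReduceLastAxisReference(const T *input, int outer, int reduce, T *output, Reducer reducer)
{
  for (int i = 0; i < outer; ++i)
  {
    T acc = input[i * reduce]; // seed with the first element, as the fast path does
    for (int r = 1; r < reduce; ++r)
    {
      acc = reducer(acc, input[i * reduce + r]);
    }
    output[i] = acc;
  }
}
// Usage: a 2x4 float input {1,2,3,4, 5,6,7,8} reduced with '+' yields {10, 26}:
//   ReduceLastAxisReference(in, 2, 4, out, [](float a, float b) { return a + b; });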
- const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f; + const float bias = -input_zero_point * scale * num_elements_in_axis; for (size_t idx = 0; idx < num_outputs; ++idx) { const U value = - static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; output_data[idx] = static_cast<T>(value); } } else { - const float bias = -input_zero_point * scale + 0.5f; + const float bias = -input_zero_point * scale; for (size_t idx = 0; idx < num_outputs; ++idx) { float float_mean = - static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, static_cast<float>(std::numeric_limits<T>::max())); result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h index 2e4fc6274..924e85037 100644 --- a/compute/cker/include/cker/operation/ReduceMean.h +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = - reducer(output_data[output_offset], input_data[input_offset], normalizer); + reducer(output_data[output_offset], input_data[input_offset], normalizer); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; } @@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return normalizer; @@ -185,8 +185,8 @@ public: } size_t normalizer = - ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, - temp_index_data(), reducer, _temp_sum.data()); + ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); if (num_outputs > 0) { float scale = input_scale / output_scale; @@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca sum_reducer); } +template <typename In, typename Out> +void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batch = output_shape.Dims(0); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 
0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; + } + } + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 7fc1e9123..ae5af7bb3 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t // Bottom right corner. output_data[output_offset + output_x_offset + output_y_offset] = - (output + ((x1y0 + x1y1) / 2)) / 2; + (output + ((x1y0 + x1y1) / 2)) / 2; } } @@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei &x1); int32_t input_offset[4] = { - Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; @@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei { const T *input_ptr = &input_data[d]; *output_ptr++ = static_cast<T>( - input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + - input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); } } } @@ -253,17 +253,102 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); float height_scale = (params.align_corners && params.output_height > 1) - ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) - : (static_cast<float>(input_height) / params.output_height); + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); float width_scale = (params.align_corners && params.output_width > 1) - ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) - : (static_cast<float>(input_width) / params.output_width); + ? 
(static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); ResizeBilinearGenericSmallChannel<uint8_t>( - batches, input_height, input_width, depth, params.output_height, params.output_width, - height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } + +inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10, + const bool half_pixel_centers, int32_t input_size, + int32_t *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9); + } + else + { + *scaled_value = value * scale_10; + } + *lower_bound = std::max(*scaled_value / (1 << 10), 0); + *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1); +} + +inline void ResizeBilinear(const ResizeBilinearParams &op_params, + const Shape &unextended_input_shape, const int8_t *input_data, + const Shape &unextended_output_shape, int8_t *output_data) +{ + // If half_pixel_centers is True, align_corners must be False. + assert(!op_params.half_pixel_centers || !op_params.align_corners); + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_height = input_shape.Dims(1); + const int32_t input_width = input_shape.Dims(2); + const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t output_height = op_params.output_height; + const int32_t output_width = op_params.output_width; + + int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height; + int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width; + if (op_params.align_corners && output_height > 1) + { + height_scale_10 = + ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1); + } + if (op_params.align_corners && output_width > 1) + { + width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1); + } + + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + int32_t input_y, y0, y1; + ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height, + &input_y, &y0, &y1); + for (int x = 0; x < output_width; ++x) + { + int32_t input_x, x0, x1; + ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width, + &input_x, &x0, &x1); + for (int c = 0; c < depth; ++c) + { + const int64_t output_20_ll = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_lu = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) * + (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_rl = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0); + const int64_t output_20_ru = + 
static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) * + (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0); + const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru; + const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19); + const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Round.h b/compute/cker/include/cker/operation/Round.h index a04a741cf..d67714564 100644 --- a/compute/cker/include/cker/operation/Round.h +++ b/compute/cker/include/cker/operation/Round.h @@ -19,6 +19,7 @@ #define __NNFW_CKER_ROUND_H__ #include "cker/Shape.h" +#include "cker/Utils.h" #include <cmath> @@ -41,6 +42,26 @@ inline float RoundToNearest(float value) } } +#ifdef USE_NEON + +inline int32x4_t RoundToNearest(const float32x4_t input) +{ +#if defined(__aarch64__) || defined(__SSSE3__) + // Note: vcvtnq_s32_f32 is not available in ARMv7 + return vcvtnq_s32_f32(input); +#else + static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f); + static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f); + static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f); + + const uint32x4_t mask = vcltq_f32(input, zero_val_dup); + const float32x4_t round = vbslq_f32(mask, minus_point5_val_dup, point5_val_dup); + return vcvtq_s32_f32(vaddq_f32(input, round)); +#endif // defined(__aarch64__) || defined(__SSSE3__) +} + +#endif // NEON + inline void Round(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h index ab2de94cc..644fe0a0e 100644 --- a/compute/cker/include/cker/operation/Select.h +++ b/compute/cker/include/cker/operation/Select.h @@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data, const T *input_y_data, const Shape &output_shape, T *output_data) { const int64_t flatsize = - MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; @@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co const int x_index = SubscriptToIndex(desc_x, b, y, x, c); const int y_index = SubscriptToIndex(desc_y, b, y, x, c); output_data[Offset(extended_output_shape, b, y, x, c)] = - input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; } } } diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h index a072cff8e..ef97fd5d8 100644 --- a/compute/cker/include/cker/operation/Slice.h +++ b/compute/cker/include/cker/operation/Slice.h @@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape, : start_b + op_params.size[0]; const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) - ? input_shape.Dims(1) - : start_h + op_params.size[size_count - 3]; + ? 
input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) - ? input_shape.Dims(2) - : start_w + op_params.size[size_count - 2]; + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) - ? input_shape.Dims(3) - : start_d + op_params.size[size_count - 1]; + ? input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; for (int in_b = start_b; in_b < stop_b; ++in_b) { diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 13e50b87a..35ecde4ba 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -23,6 +23,10 @@ #include "cker/Types.h" #include "cker/eigen/Utils.h" +#if __aarch64__ && __clang__ +#define TFLITE_SOFTMAX_USE_UINT16_LUT +#endif + #include <Eigen/Core> #include <fixedpoint/fixedpoint.h> #include <cmath> @@ -32,6 +36,45 @@ namespace nnfw namespace cker { +namespace reference +{ + +// Note. This Softmax function supports all of dimensions +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) + { + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float max = std::numeric_limits<float>::lowest(); + for (int c = 0; c < depth; ++c) + { + max = std::max(max, input_data[i * depth + c]); + } + + // Compute sum. + float sum = 0.f; + for (int c = 0; c < depth; ++c) + { + sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)); + } + + // Compute result. + for (int c = 0; c < depth; ++c) + { + output_data[i * depth + c] = + std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; + } + } +} +} // namespace reference + // Performs softmax along the input of size (input_size * batch_size). inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta, float *out) @@ -88,87 +131,306 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const out_mat.array().rowwise() *= scale; } -inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) -{ - const int32_t input_beta_multiplier = params.input_multiplier; - const int32_t input_beta_left_shift = params.input_left_shift; - const int diff_min = params.diff_min; - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. 
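// Illustrative sketch, not part of the patch: the numerical-stability trick used
// by the new reference::Softmax above. Subtracting the per-row max before exp()
// leaves the result unchanged, since exp(x - C) / sum(exp(x - C)) ==
// exp(x) / sum(exp(x)), but keeps every exp() argument <= 0 so nothing overflows.
// Hypothetical single-row helper:
#include <algorithm>
#include <cmath>

inline void SoftmaxRowReference(const float *in, int depth, float beta, float *out)
{
  const float max = *std::max_element(in, in + depth);
  float sum = 0.f;
  for (int c = 0; c < depth; ++c)
    sum += std::exp((in[c] - max) * beta);
  for (int c = 0; c < depth; ++c)
    out[c] = std::exp((in[c] - max) * beta) / sum;
}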
- static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; +template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) +{ + const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled)); + return prob_rnd + zero_point; +} + +#if !__aarch64__ +// With ARM64, rounding is faster than add + truncation. +template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t) +{ + return static_cast<int32_t>(prob_rescaled + 0.5f); +} +#endif + +inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta) +{ + const float scale = -input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + for (int32_t val = 0; val <= max_uint8; ++val) + { + table[max_uint8 - val] = expf(scale * val); + } +} +template <typename In, typename Out> +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const In *input_data, + const Shape &output_shape, Out *output_data) +{ const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); - for (int i = 0; i < outer_size; ++i) + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + for (int i = 0; i < excluding_last_dim; ++i) { - uint8_t max_in_row = 0; - for (int c = 0; c < depth; ++c) + int32_t max_val = std::numeric_limits<In>::min(); + // Find max quantized value. + for (int j = 0; j < last_dim; ++j) { - max_in_row = std::max(max_in_row, input_data[i * depth + c]); + max_val = std::max(max_val, static_cast<int32_t>(input_data[j])); } - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) + float sum_exp = 0.0f; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const float *table_offset = ¶ms.table[max_uint8 - max_val]; + // Calculate normalizer sum(exp(x)). + for (int j = 0; j < last_dim; ++j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } + sum_exp += table_offset[input_data[j]]; } - int32_t fixed_sum_of_exps = sum_of_exps.raw(); - int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. 
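// Illustrative sketch, not part of the patch: how the 256-entry table filled by
// PopulateSoftmaxLookupTable above is indexed by the templated Softmax. Since
// table[255 - v] == expf(-input_scale * beta * v), reading the table at
// (255 - max_q) + q yields expf(input_scale * beta * (q - max_q)), i.e. exp() of
// the already max-subtracted, dequantized difference. Hypothetical helper:
#include <cstdint>
#include <limits>

inline float LookupSoftmaxExp(const float *table, uint8_t q, int32_t max_q)
{
  const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
  return table[(max_uint8 - max_q) + q];
}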
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + // Normalize and quantize probabilities. + for (int j = 0; j < last_dim; ++j) + { + const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp; + const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point); + output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + } + input_data += last_dim; + output_data += last_dim; + } +} - FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); +#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT +// Looks up each element of <indices> in <table>, returns them in a vector. +inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices) +{ + // Look up in 1st quarter of the table: top 2 bits of indices == 00 + uint8x16_t output1 = vqtbl4q_u8(table[0], indices); + // Look up in 2nd quarter of the table: top 2 bits of indices == 01 + uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40))); + // Look up in 3rd quarter of the table: top 2 bits of indices == 10 + uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80))); + // Look up in 4th quarter of the table: top 2 bits of indices == 11 + uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0))); - for (int c = 0; c < depth; ++c) + // Combine result of the 4 lookups. + return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4)); +} + +inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2, + float input_scale, float beta) +{ + const float scale = input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const int32_t max_uint16 = std::numeric_limits<uint16_t>::max(); + + for (int32_t val = 0; val <= max_uint8; ++val) + { + float input_to_exp = scale * (val - max_uint8); + int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5); + temp = std::min(max_uint16, temp); + uint8_t part1 = temp >> 8; + uint8_t part2 = temp & 0xff; + uint8_table1[val] = static_cast<uint8_t>(part1); + uint8_table2[val] = static_cast<uint8_t>(part2); + } +} + +inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset) +{ + int32_t max_val = std::numeric_limits<uint8_t>::min(); + int j = 0; + + uint8x16_t max_val_dup = vdupq_n_u8(max_val); + uint8x16_t offset_dup = vdupq_n_u8(offset); + for (; j <= size - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data + j); + input_value = veorq_u8(input_value, offset_dup); + max_val_dup = vmaxq_u8(input_value, max_val_dup); + } + max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup))); + + for (; j < size; ++j) + { + max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset)); + } + return max_val; +} + +#ifdef USE_NEON +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. 
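// Illustrative sketch, not part of the patch: scalar equivalent of
// aarch64_lookup_vector above. The four uint8x16x4_t quarters together hold one
// 256-byte table, and every output lane is simply table[index]; the XOR with
// 0x40/0x80/0xc0 only re-bases indices into the quarter being probed (indices
// outside that quarter read as zero and are OR-combined away). Hypothetical helper:
#include <cstdint>

inline void LookupBytesReference(const uint8_t table[256], const uint8_t *indices, int n,
                                 uint8_t *out)
{
  for (int i = 0; i < n; ++i)
    out[i] = table[indices[i]];
}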
+inline void StoreValue(int32x4x4_t value_to_store, int8_t *output) +{ + const int16x8_t result_1 = + vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0])); + const int16x8_t result_2 = + vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2])); + const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1)); + vst1q_s8(output, result); +} + +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output) +{ + const uint16x8_t result_1 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0]))); + const uint16x8_t result_2 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2]))); + const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1)); + vst1q_u8(output, result); +} + +#endif + +template <typename In, typename Out> +inline void SoftmaxInt8LUT(const SoftmaxParams ¶ms, const Shape &input_shape, + const In *input_data, const Shape &output_shape, Out *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + + // Offset is used to interpret the input data "correctly". + // If the input is uint8, the data will be unchanged. + // If the input is int8, since it will be reinterpret as uint8. + // e.g., + // int8 127 will be applied "offset" to become 255 in uint8. + uint8_t offset = 0; + if (std::is_same<In, int8_t>::value) + { + offset = 0x80; + } + + const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data); + + // This code uses ARM64-only instructions. + // TODO(b/143709993): Port to ARMv7 + + // Load the tables into registers. (4*4 128-bit registers) + uint8x16x4_t table1[4]; + table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0); + table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1); + table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2); + table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3); + + uint8x16x4_t table2[4]; + table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0); + table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1); + table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2); + table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3); + + for (int i = 0; i < excluding_last_dim; ++i) + { + // Find max quantized value. + int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset); + + int32_t sum_exp = 0; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const uint8_t table_offset = max_uint8 - max_val; + + // Calculate normalizer sum(exp(x)). 
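// Illustrative sketch, not part of the patch: the two byte tables filled by
// PopulateSoftmaxUInt8LookupTable above encode one 16-bit value per input byte,
// (uint8_table1[i] << 8) | uint8_table2[i] ~= expf(scale * (i - 255)) * 65535,
// which the scalar tail loops below reassemble like this. Hypothetical helper:
#include <cstdint>

inline int32_t LookupExp16(const uint8_t *table1, const uint8_t *table2, uint8_t index)
{
  const int32_t part1 = table1[index]; // high byte of the 16-bit exp value
  const int32_t part2 = table2[index]; // low byte
  return (part1 << 8) + part2;
}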
+ int sum_j = 0; + uint8x16_t table_offset_dup = vdupq_n_u8(table_offset); + uint8x16_t offset_dup = vdupq_n_u8(offset); + uint32x4_t sum_4 = vdupq_n_u32(0); + const int multiplier_shift = 8; + for (; sum_j <= last_dim - 16; sum_j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + sum_4 = vpadalq_u16(sum_4, exp_value1); + sum_4 = vpadalq_u16(sum_4, exp_value2); + } + int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) + + vgetq_lane_u32(sum_4, 3); + sum_exp += temp; + + for (; sum_j < last_dim; ++sum_j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), - num_bits_over_unit + 31 - 8); - - output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); - } - else - { - output_data[i * depth + c] = 0; - } + const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset; + + uint8_t part1 = params.uint8_table1[index]; + uint8_t part2 = params.uint8_table2[index]; + sum_exp += ((part1 << 8) + part2); + } + + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + + int32_t multiplier, shift; + QuantizeMultiplier(inv_sum_exp, &multiplier, &shift); + + // Normalize and quantize probabilities. 
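// Illustrative note, not part of the patch: QuantizeMultiplier splits the float
// inv_sum_exp into a Q0.31 significand plus a power-of-two shift, and
// MultiplyByQuantizedMultiplier(x, multiplier, shift) then approximates
// x * inv_sum_exp in pure integer arithmetic. A hypothetical float reference for
// sanity-checking the fixed-point path:
#include <cmath>
#include <cstdint>

inline int32_t ScaleByFloatReference(int32_t exp_value, float inv_sum_exp)
{
  return static_cast<int32_t>(std::lround(static_cast<double>(exp_value) * inv_sum_exp));
}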
+ int j = 0; + const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point); + const int32x4_t max_val_dup = vdupq_n_s32(clamp_max); + const int32x4_t min_val_dup = vdupq_n_s32(clamp_min); + + for (; j <= last_dim - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + int32x4x4_t output_value; + output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1))); + output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1))); + output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2))); + output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2))); + + int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift); + + temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup); + temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup); + temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup); + temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup); + + temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup); + temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup); + temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup); + temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup); + + StoreValue(temp_val, output_data + j); + } + for (; j < last_dim; ++j) + { + const uint8_t index = (input_data_uint[j] ^ offset) + table_offset; + const uint8_t part1 = params.uint8_table1[index]; + const uint8_t part2 = params.uint8_table2[index]; + const int32_t exp_value = (part1 << 8) + part2; + const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift); + + output_data[j] = static_cast<Out>( + std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min)); } + input_data_uint += last_dim; + output_data += last_dim; } } +#endif } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h index feeb358c9..aff36e2f3 100644 --- a/compute/cker/include/cker/operation/SpaceToBatchND.h +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unexte else { const T *in = - input_data + Offset(input_shape, input_batch, - (out_h * block_shape_height + shift_h) - padding_top, - (out_w * block_shape_width + shift_w) - padding_left, 0); + input_data + Offset(input_shape, input_batch, + (out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index d5952ae23..dcf649ca1 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ 
b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output) Distribution()); } -inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, - const Shape &seed_shape, const int *seed_data, +inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data, + const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data) { Tensor shape_t; @@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da GenerateKey(seed_t, &key, &counter); Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( - random::PhiloxRandom(counter, key), &output_t); + random::PhiloxRandom(counter, key), &output_t); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h index c57b4daa0..2f1089575 100644 --- a/compute/cker/include/cker/operation/StridedSlice.h +++ b/compute/cker/include/cker/operation/StridedSlice.h @@ -260,12 +260,41 @@ template <typename T> inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data) { - // Note that the output_shape is not used herein. - StridedSliceParams params_copy = op_params; - assert(unextended_input_shape.DimensionsCount() <= 4); assert(unextended_output_shape.DimensionsCount() <= 4); + bool optimize = true; + int st_count = op_params.strides_count; + for (int idx = 0; idx < st_count - 1; idx++) + { + const int axis_size = unextended_input_shape.Dims(idx); + const int start = StartForAxis(op_params, unextended_input_shape, idx); + const int stop = StopForAxis(op_params, unextended_input_shape, idx, start); + if ((axis_size != 1) && (start != 0 || stop != 0)) + { + optimize = false; + break; + } + } + + if (optimize) + { + if (op_params.strides[st_count - 1] == 1) + { + const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1); + const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start); + + for (int idx = 0; idx < end - start; idx++) + { + output_data[idx] = input_data[idx + start]; + } + return; + } + } + + // Note that the output_shape is not used herein. 
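// Illustrative sketch, not part of the patch: the fast path added to StridedSlice
// above applies when the slice effectively selects only along the innermost axis
// (each leading dimension is trivial) and the last-axis stride is 1; the op then
// degenerates to copying one contiguous range [start, end). Hypothetical values:
#include <algorithm>

// input {10, 20, 30, 40, 50}, start = 1, end = 4  ->  output {20, 30, 40}
inline void SliceLastAxisContiguous(const float *input, int start, int end, float *output)
{
  std::copy(input + start, input + end, output);
}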
+ StridedSliceParams params_copy = op_params; + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h index 1dcdd9b79..42433468a 100644 --- a/compute/cker/include/cker/operation/Tile.h +++ b/compute/cker/include/cker/operation/Tile.h @@ -55,7 +55,7 @@ std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_dat { int stride_size = 0, tiled_stride_size = 0; std::tie(stride_size, tiled_stride_size) = - TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); copy_from_data += stride_size; copy_to_data += tiled_stride_size; total_stride_size += stride_size; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 9d8cd340d..52c826c39 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -288,7 +288,7 @@ size_t Flatten(const Shape &input_shape, const Shape &output_shape, const Transp return flat_size; } -} // namespace anonymous (util) +} // namespace // Transpose2D only deals with typical 2D matrix transpose ops. // Perform transpose by transposing 4x4 blocks of the input, proceeding from @@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp const int total_size = shrunk_input_shape.FlatSize(); const int non_flatten_size = - Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, + Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, - &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); + &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); assert(non_flatten_params.perm[0] != 0); for (int i = 0; i < total_size; i += non_flatten_size) diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h index 7db3a1179..d41f86047 100644 --- a/compute/cker/include/cker/operation/TransposeConv.h +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_ (out_y < output_height)) { float input_value = - input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, - filter_x, in_channel)]; + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += - input_value * filter_value; + input_value * filter_value; } } } diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index ac5069917..1fe3e1517 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -19,6 +19,8 @@ #define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__ #include <functional> +#include <limits> +#include <utility> #include "cker/neon/neon_check.h" #include "cker/operation/reference/BinaryArithmeticOps.h" #include "cker/Shape.h" @@ -33,8 +35,9 @@ namespace cker namespace 
optimized { +/* Old version: For Sub(float) and Div. */ template <typename ElementwiseF, typename ScalarBroadcastF, typename T> -inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool switch_inputs, const Shape & /* unswitched_input1_shape */, const T *unswitched_input1_data, const Shape & /* unswitched_input2_shape */, @@ -42,11 +45,8 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, const Shape & /* output_shape */, T *output_data, ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) { - const bool use_unswitched = - params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data; + const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data; // Fivefold nested loops. The second input resets its position for each // iteration of the second loop. The first input resets its position at the @@ -123,29 +123,129 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, } } -inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +// New version: For Mul, Add and Sub(quant8) +template <typename ElementwiseF, typename ScalarBroadcastF, typename T> +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params, + const Shape & /* unswitched_input1_shape */, + const T *unswitched_input1_data, + const Shape & /* unswitched_input2_shape */, + const T *unswitched_input2_data, + const Shape & /* output_shape */, T *output_data, + ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) +{ + BinaryArithmeticOpParam switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const BinaryArithmeticOpParam ¶ms = use_unswitched ? unswitched_params : switched_params; + const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + T *output_data_ptr = output_data; + const T *input1_data_ptr = input1_data; + const T *input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. 
Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) + { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + for (int i3 = 0; i3 < y3; ++i3) + { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } + else + { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_sum(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); return clamped_output; } -inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + 
uint8_t *output_data) { int i = 0; @@ -193,9 +293,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int16x4_t s1_narrowed = vmovn_s32(s1); const int16x4_t s2_narrowed = vmovn_s32(s2); const int16x8_t s = - vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); - const uint8x8_t clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -206,12 +306,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -220,7 +320,248 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, - const float *input1_data, const float *input2_data, float *output_data) + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, 
input1_offset_dup); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = + vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = + vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t 
scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +struct BinaryOpFuncAddFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vaddq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a + b; } +}; + +struct BinaryOpFuncSubFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vsubq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a - b; } +}; + +struct BinaryOpFuncMulFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vmulq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a * b; } +}; + +struct BinaryOpFuncDivFloat +{ +#ifdef USE_NEON +#ifdef __aarch64__ + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vdivq_f32(a, b); + } +#endif // __aarch64__ +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a / b; } +}; + +template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs +{ + template <typename T> static inline T calculate(const T &a, const T &b) + { + return BASEOPERATOR::calculate(b, a); + } +}; + +struct BinaryOpActivationFloatNone +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + (void)floorParam; + return value; + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + (void)floorParam; + return value; + } +}; + +struct BinaryOpActivationFloatMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +struct BinaryOpActivationFloatMinMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + return vminq_f32(value, ceilingParam); + } + static 
inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + return std::min(value, ceilingParam); + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +template <class OPERATOR, class ACTIVATION> +inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const float *input1_data, const float *input2_data, + float *output_data) { int i = 0; @@ -237,18 +578,18 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, auto a21 = vld1q_f32(input2_data + i + 4); auto a22 = vld1q_f32(input2_data + i + 8); auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vaddq_f32(a10, a20); - auto x1 = vaddq_f32(a11, a21); - auto x2 = vaddq_f32(a12, a22); - auto x3 = vaddq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); + auto x0 = OPERATOR::calculate(a10, a20); + auto x1 = OPERATOR::calculate(a11, a21); + auto x2 = OPERATOR::calculate(a12, a22); + auto x3 = OPERATOR::calculate(a13, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); vst1q_f32(output_data + i, x0); vst1q_f32(output_data + i + 4, x1); vst1q_f32(output_data + i + 8, x2); @@ -258,26 +599,101 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, { auto a1 = vld1q_f32(input1_data + i); auto a2 = vld1q_f32(input2_data + i); - auto x = vaddq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + auto x = OPERATOR::calculate(a1, a2); // vaddq + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); } -#endif // NEON +#endif // USE_NEON + for (; i < size; i++) + { + auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + } +} + +// Broadcast binary op template that can often be used for inner loop +// This function will handle scalar_value (LHS) and vector_values (RHS). +// Since it's a float function, input params does not matter here. 
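// Both the elementwise loop above and the scalar-broadcast variant below reduce, per element,
// to the same composition of the OPERATOR / ACTIVATION policy structs. A minimal scalar sketch
// (BinaryOpScalarSketch is a hypothetical helper used only for illustration, not part of the patch):
template <class OPERATOR, class ACTIVATION>
inline float BinaryOpScalarSketch(float a, float b, const BinaryArithmeticOpParam &params)
{
  // apply the binary op, then clamp with the activation policy
  const float x = OPERATOR::calculate(a, b);
  return ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, params.float_activation_min),
                                  params.float_activation_max);
}
// For example, a float Add fused with ReLU6 ends up as
// BinaryOpElementwise<BinaryOpFuncAddFloat, BinaryOpActivationFloatMinMax>, selected by
// getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params) further below.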
+template <class OPERATOR, class ACTIVATION> +inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const float broadcast_value, const float *input2_data, + float *output_data) +{ + int i = 0; + +#ifdef USE_NEON + const auto activation_min = vdupq_n_f32(params.float_activation_min); + const auto activation_max = vdupq_n_f32(params.float_activation_max); + const auto broadcast_value_dup = vdupq_n_f32(broadcast_value); + for (; i <= size - 16; i += 16) + { + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = OPERATOR::calculate(broadcast_value_dup, a20); + auto x1 = OPERATOR::calculate(broadcast_value_dup, a21); + auto x2 = OPERATOR::calculate(broadcast_value_dup, a22); + auto x3 = OPERATOR::calculate(broadcast_value_dup, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) + { + auto a2 = vld1q_f32(input2_data + i); + auto x = OPERATOR::calculate(broadcast_value_dup, a2); + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); + } +#endif // USE_NEON for (; i < size; i++) { - auto x = input1_data[i] + input2_data[i]; - output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min, - params.float_activation_max); + auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } -inline void AddQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +using BinaryOpImplFloatFuncs = + std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), + void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; + +template <class FUNC> +inline BinaryOpImplFloatFuncs +getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam ¶ms) +{ + if (params.float_activation_max == std::numeric_limits<float>::max()) + if (params.float_activation_min == std::numeric_limits<float>::lowest()) + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>); +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, 
const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -285,15 +701,16 @@ inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const Shape &output_shape, float *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwise(flat_size, params, input1_data, input2_data, output_data); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } // Scalar-broadcast add that can be used for inner loop of more general // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. -inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -304,58 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, - float broadcast_value, const float *input2_data, float *output_data) +// Scalar-broadcast add that can be used for inner loop of more general +// broadcast add, so that, for example, scalar-broadcast with batch will still +// be fast. +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, int8_t input1_data, + const int8_t *input2_data, int8_t *output_data) { + using gemmlowp::RoundingDivideByPOT; int i = 0; #ifdef USE_NEON - const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min); - const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max); - const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value); - for (; i <= size - 4; i += 4) - { - const float32x4_t input2_val_original = vld1q_f32(input2_data + i); + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); - const float32x4_t output = vaddq_f32(input2_val_original, broadcast_value_dup); + // Process broadcast scalar. 
+ const int8x8_t input1_val_original = vdup_n_s8(input1_data); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); - const float32x4_t clamped = - vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output)); - vst1q_f32(output_data + i, clamped); + for (; i <= size - 8; i += 8) + { + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x21 = vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } #endif // NEON - for (; i < size; ++i) + + if (i < size) { - auto x = broadcast_value + input2_data[i]; - output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min, - params.float_activation_max); + // Process broadcast scalar. 
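// The scalar tail below mirrors the NEON block above: each operand is offset (q + input_offset),
// shifted up by params.left_shift, rescaled by its per-input multiplier/shift via
// MultiplyByQuantizedMultiplierSmallerThanOneExp (a rounding-doubling high multiply followed by a
// rounding right shift), the two results are summed, rescaled once more by the output
// multiplier/shift, offset by params.output_offset, and finally clamped to
// [quantized_activation_min, quantized_activation_max].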
+ const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } } } -inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_sum(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); - } - else - { - BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_sum(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + return; } + + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + AddElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + AddScalarBroadcast)); } inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -366,18 +840,18 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a + b; }; + [](const 
float &a, const float &b) -> float { return a + b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } else { + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, - float *)>(AddElementwise), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>( - AddScalarBroadcast)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } } @@ -385,75 +859,57 @@ inline void Sub(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data) { - int i = 0; - const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); -#ifdef USE_NEON - const auto activation_min = vdupq_n_f32(params.float_activation_min); - const auto activation_max = vdupq_n_f32(params.float_activation_max); - for (; i <= size - 16; i += 16) + const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); +} + +inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vsubq_f32(a10, a20); - auto x1 = vsubq_f32(a11, a21); - auto x2 = vsubq_f32(a12, a22); - auto x3 = vsubq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params); + BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } - for (; i <= size - 4; i += 4) + else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vsubq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + auto implFuncs = + 
getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); + BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } -#endif // NEON - - for (; i < size; i++) + else { - auto x = input1_data[i] - input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a - b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); } } -inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_mul(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32_t clamped_output = std::min( - params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); return clamped_output; } -inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -495,8 +951,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + const auto clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -506,76 +962,111 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t input1_val = params.input1_offset + input1_data[i]; const int32_t input2_val = params.input2_offset + input2_data[i]; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast<uint8_t>(clamped_output); } } inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, - const float 
*input1_data, const float *input2_data, float *output_data) + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) { int i = 0; - #ifdef USE_NEON - const auto activation_min = vdupq_n_f32(params.float_activation_min); - const auto activation_max = vdupq_n_f32(params.float_activation_max); + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 16; i += 16) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vmulq_f32(a10, a20); - auto x1 = vmulq_f32(a11, a21); - auto x2 = vmulq_f32(a12, a22); - auto x3 = vmulq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); - } - for (; i <= size - 4; i += 4) - { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vmulq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
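// The vector body below widens each int8x16 load to four int32x4 lanes (vmovl_s8 / vmovl_s16
// after adding the input offsets), forms the products with vmull_s16, requantizes with a left
// shift + vqrdmulhq_n_s32(output_multiplier) + RoundingDivideByPOT (the left/right parts split
// out of params.output_shift), then narrows back through int16, adds output_offset, and
// saturates to int8 before the activation clamp.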
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON - for (; i < size; i++) + for (; i < size; ++i) { - auto x = input1_data[i] * input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = 
static_cast<int8_t>(clamped_output); } } -inline void MulQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -583,12 +1074,13 @@ inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const Shape &output_shape, float *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwise(flat_size, params, input1_data, input2_data, output_data); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } -inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -600,60 +1092,108 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } // Broadcast mul that can often be used for inner loop of broadcast Mul. -// This function will handle scalar_value (LHS) * vector_values (RHS). -// Since it's a float function, input params does not matter here. inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, - const float broadcast_value, const float *input2_data, - float *output_data) + const int8_t broadcast_value, const int8_t *input2_data, + int8_t *output_data) { + const int16_t input1_val = params.input1_offset + broadcast_value; + int i = 0; #ifdef USE_NEON - const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min); - const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max); - const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value); - for (; i <= size - 4; i += 4) + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) { - const float32x4_t input2_val_original = vld1q_f32(input2_data + i); + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
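// Same requantization flow as MulElementwise above; the broadcast operand is pre-offset once
// into input1_val and folded into every lane with vmull_n_s16 instead of being reloaded per
// iteration.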
+ const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + + const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); - const float32x4_t output = vmulq_f32(input2_val_original, broadcast_value_dup); + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); - const float32x4_t clamped = - vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output)); - vst1q_f32(output_data + i, clamped); + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON for (; i < size; ++i) { - float x = broadcast_value * input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); } } -inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == 
BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_mul(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_mul(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + MulElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + MulSimpleBroadcast)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -665,17 +1205,59 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { // TODO: Use GetBinaryArithmeticFn const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a * b; }; + [](const float &a, const float &b) -> float { return a * b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } - BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, - float *)>(MulElementwise), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>( - MulSimpleBroadcast)); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); +} + +inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, const float *input2_data, + const Shape &output_shape, float *output_data) +{ +#ifdef __aarch64__ + const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); +#else + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a / b; }; + reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, fn); +#endif // __aarch64__ +} + +inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float 
*input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ +#ifdef __aarch64__ + if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast) + { + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params); + BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); + } + else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) + { + auto implFuncs = + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); + BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); + } + else +#endif // __aarch64__ + { + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a / b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + } } } // namespace optimized diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h index 0f620146c..6e0e129c6 100644 --- a/compute/cker/include/cker/operation/optimized/Conv.h +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -42,13 +42,15 @@ namespace cker namespace optimized { +std::mutex _gemmlowp_mutex; + struct GemmlowpOutputPipeline { typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap; typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; + Pipeline; static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max) @@ -106,7 +108,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 const int filter_height = filter_shape.Dims(1); const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = - stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_dilated_im2col) { assert(im2col_data); @@ -141,7 +143,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 // the other calls commented out. This is a partial rollback of cl/196819423. // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); const int gemm_input_cols = - gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); + gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); const int filter_rows = filter_shape.Dims(0); // See b/79927784. 
// const int filter_cols = FlatSizeSkipDim(filter_shape, 0); @@ -156,17 +158,19 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 assert(bias_shape.FlatSize() == output_rows); UNUSED_RELEASE(bias_shape); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, filter_rows, filter_cols); + filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); + gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows, output_cols); const auto &output_pipeline = - GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); + + std::lock_guard<std::mutex> lock_guard(_gemmlowp_mutex); gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, - output_pipeline); + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); } } // namespace optimized @@ -202,10 +206,10 @@ public: T *output_data, int output_height, int output_width) { const bool is_1x1_kernel = - (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); const bool is_same_height_width = - (filter_height == input_height && filter_width == input_width && pad_width == 0 && - pad_height == 0); + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); if (is_1x1_kernel || is_same_height_width) { // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h new file mode 100644 index 000000000..17b2fc7a2 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON + +template <> struct FloatDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. 
+ for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters + float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3); + local_filter_ptr += 16; + // Load the inputs + float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0); + float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1); + float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2); + float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3); + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + // Multiply-accumulate + acc_0 = vmlaq_f32(acc_0, input_0, filter_0); + acc_1 = vmlaq_f32(acc_1, input_1, filter_1); + acc_2 = vmlaq_f32(acc_2, input_2, filter_2); + acc_3 = vmlaq_f32(acc_3, input_3, filter_3); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. 
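// Each FloatDepthwiseConvKernel specialization accumulates, for every output pixel,
// input_depth * depth_multiplier products into acc_buffer_ptr and then advances input_ptr by
// input_ptr_increment; the loops below are progressively narrower NEON tails of that pattern.
// For this depth_multiplier == 1 case the per-channel body is simply
// acc_buffer_ptr[c] += filter[c] * input[c], as the scalar fallback at the end of this loop
// nest shows.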
+ for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +// Note this implementation is very slow for input_depths < 8 +// (e.g. comparable to reference implementation) see, specializations for +// input_depth=3 below. +template <> struct FloatDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) + { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. 
+ for (; ic < input_depth; ic++) + { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x2_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1_f32(filter_ptr + 2 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x2_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate for each input channel there 2 outputs + acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 6; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // NOTE: we only want 3 values, so we read it as two ops where + // the second op just duplicates the lane + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x4_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate all outputs. + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 12; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5); + float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6); + float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5); + float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6); + float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val); + acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val); + acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + + // Handle one output pixel at a 
time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 16> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
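+    // (Per output pixel this loads the 8 input channels and the 8 accumulators,
+    // multiply-accumulates them against the 8 filter taps held in the two
+    // registers loaded above, and stores the result back.)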
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x2_t filter = vld1_f32(filter_ptr); + float32x4_t filter_x4 = vcombine_f32(filter, filter); + int outp = 0; + + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x2_t input_1 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x2_t input_2 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x4_t input = vcombine_f32(input_1, input_2); + + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter_x4); + + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x2_t input = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmla_f32(acc, input, filter); + + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x4_t filter = vld1q_f32(filter_ptr); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input = vld1q_f32(input_ptr); + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, + const float *input_data, int pad_width, int depth_multiplier, + int filter_width, const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float *acc_buffer) +{ + // Sanity check parameters. 
This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment, + filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
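+// It processes one output x position at a time with plain scalar loops over
+// input channels and depth-multiplier lanes, and is selected whenever no
+// specialized NEON kernel matches the (stride, input_depth, depth_multiplier)
+// combination.
+//
+// Worked example of the clamping arithmetic below (values chosen purely for
+// illustration): with pad_width = 1, dilation_factor = 1, filter_x = 0 and
+// stride = 2, out_x_loop_start is (1 - 0 + 2 - 1) / 2 = 1, because out_x = 0
+// would read input x = 0 * 2 - 1 + 0 = -1, which lies in the left padding.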
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+                                              int input_width, const float *input_data,
+                                              int pad_width, int depth_multiplier, int filter_width,
+                                              const float *filter_data, int out_x_buffer_start,
+                                              int out_x_buffer_end, int output_depth,
+                                              float *acc_buffer)
+{
+  const float *filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+  {
+    const int out_x_loop_start =
+      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+      std::min(out_x_buffer_end,
+               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+    float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float *input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+    {
+      const float *filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic)
+      {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++)
+        {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float *bias_data, float *acc_buffer)
+{
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++)
+  {
+    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+// DepthwiseConv can run multi-threaded along the dim specified by thread_dim.
+// Each thread processes output elements on dim thread_dim in the range
+// [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
+// thread computes DepthwiseConv for output_data[:, 2:6, :, :], i.e. output
+// rows 2 through 5.
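+//
+// A minimal sketch of how a caller might partition the row axis across threads
+// (hypothetical illustration only, not part of this file):
+//   const int rows_per_thread = (output_height + num_threads - 1) / num_threads;
+//   for (int t = 0; t < num_threads; ++t)
+//   {
+//     const int start = t * rows_per_thread;
+//     const int end = std::min(output_height, start + rows_per_thread);
+//     if (start < end)
+//       DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
+//                         bias_shape, bias_data, output_shape, output_data,
+//                         /*thread_start=*/start, /*thread_end=*/end, /*thread_dim=*/1);
+//   }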
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
+                              const float *input_data, const Shape &filter_shape,
+                              const float *filter_data, const Shape &bias_shape,
+                              const float *bias_data, const Shape &output_shape, float *output_data,
+                              int thread_start, int thread_end, int thread_dim)
+{
+  UNUSED_RELEASE(bias_shape);
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  assert(thread_dim == 0 || thread_dim == 1);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  assert(output_depth == input_depth * depth_multiplier);
+  assert(bias_shape.FlatSize() == output_depth);
+
+  static const int kAccBufferMaxSize = 4832;
+  float acc_buffer[kAccBufferMaxSize];
+  assert(kAccBufferMaxSize >= output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+  assert(kAccBufferActualSize <= kAccBufferMaxSize);
+  assert(kOutputPixelsInAccBuffer >= 1);
+
+  UNUSED_RELEASE(kAccBufferActualSize);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
+  {                                                                                               \
+    row_accum_func =                                                                              \
+      FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;       \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
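+  // For example (illustration only): a layer with stride 2, input_depth == 3
+  // and depth_multiplier == 2 is rejected by every entry until (true, 3, 2)
+  // below, so row_accum_func becomes FloatDepthwiseConvAccumRow<true, 3, 2>.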
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = FloatDepthwiseConvAccumRowGeneric; + } + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + float *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. 
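+        // Each iteration adds the contribution of one filter row: in_y is the
+        // input row that filter_y reads for this out_y, and row_accum_func
+        // accumulates it into every output pixel of the current x-buffer window.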
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating. Now store to destination. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +// TODO(benoitjacob) optimized code goes here +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) + { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) + { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + for (int k = 0; k < 4; k++) + { + acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) + { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) + { + float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + float acc = acc_buffer[i]; + acc = std::max(output_activation_min, std::min(output_activation_max, acc)); + + *output_ptr++ = acc; + } + } + } + output_ptr += batch_step; + } +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h index d383b126d..5ca56fd09 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -32,6 +32,8 @@ namespace cker { namespace optimized { +namespace depthwise_conv +{ // Implementation of quantized DepthwiseConv @@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { filter[i] = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); } // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) @@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { acc[0].val[i] = - vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); acc[1].val[i] = - vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> for (int i = 0; i < 2; i++) { acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> 
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // We will do that by register-level table-look-up using VTBL instructions. 
// Here we prepare the registers containing the table-lookup indices. static const uint8_t dup3_indices_array[3][8] = { - {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; uint8x8_t dup3_indices[3]; for (int i = 0; i < 3; i++) { @@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> for (int j = 0; j < 3; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 3; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 3; @@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (int j = 0; j < 2; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); } // Store the accumulators back to acc_buffer. for (int i = 0; i < 2; i++) @@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (; ic < input_depth; ic++) { // Load the inputs. 
- const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 2; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 2; @@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; - const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++ + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } input_ptr += input_ptr_increment; @@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, 
int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[4]; for (int i = 0; i < 4; i++) @@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; // Load the filters, add filter_offset. const uint8x8_t filter_u8 = vld1_u8(filter_ptr); const int16x8_t filter = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[2]; for (int i = 0; i < 2; i++) @@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1); input_ptr += input_ptr_increment; const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); // Multiply-accumulate. @@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d else { out_x_loop_start_unclampled = - (pad_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; out_x_loop_end_unclampled = - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; } } else @@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d const uint8_t *input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( - num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, - input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); + num_output_pixels, input_depth, depth_multiplier, 
input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); filter_base_ptr += output_depth; } } @@ -1690,11 +1692,11 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto const uint8_t *filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; @@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, - uint8_t *output_data) + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) { (void)bias_shape; const int stride_width = params.stride_width; @@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); assert(kAccBufferActualSize <= kAccBufferMaxSize); assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + UNUSED_RELEASE(kAccBufferActualSize); // row_accum_func will point to the core accumulation function to be used @@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ { \ row_accum_func = \ - QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ } #ifdef USE_NEON @@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); // Now that we have determined row_accum_func, we can start work. 
- uint8_t *output_ptr = output_data; - for (int b = 0; b < batches; ++b) + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + uint8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = row_start; out_y < row_end; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; const int filter_y_start = - std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); const int filter_y_end = - std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / - dilation_height_factor); + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = - std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activation that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } - // Finished accumulating int32 values. Now need to convert them to + // Finished accumulating int32_t values. Now need to convert them to // the final 8bit form and store them. 
const int num_output_values = output_depth * num_output_pixels; int i = 0; @@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape } } } + output_ptr += batch_step; } } +} // namespace depthwise_conv + +// template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +// TODO Use below codes +// // Dispatch to dot-product 3x3 kernels when supported. +// +// ruy::Context *ruy_context = cpu_backend_context->ruy_context(); +// const bool has_dot_product_instructions = +// ruy_context != nullptr && +// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone; +// if (has_dot_product_instructions) +// { +// using optimized_ops::depthwise_conv::DotProduct3x3KernelType; +// DotProduct3x3KernelType kernel_type = +// optimized_ops::depthwise_conv::CategorizeDotProductKernel( +// input_shape, filter_shape, params); +// if (kernel_type != DotProduct3x3KernelType::kNone) +// { +// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3< +// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data, +// filter_shape, filter_data, +// bias_shape, +// bias_data, output_shape, +// output_data); +// return; +// } +// } +// +// // Dispatch to non-dot-product 3x3 kernels when supported. +// +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width, +// stride_height, dilation_width_factor, +// dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) +// { +// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>( +// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, +// output_shape, output_data, thread_start, thread_end, thread_dim); +// return; +// } +#endif + + depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); +} + } // namespace optimized } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/optimized/Gemm.h b/compute/cker/include/cker/operation/optimized/Gemm.h new file mode 100644 index 000000000..cfebef452 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/Gemm.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_GEMM_H__ +#define __NNFW_CKER_OPTIMIZED_GEMM_H__ + +#include "cker/eigen/eigen_gemm_eigen.h" +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <ruy/context.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +#if defined(CKER_X86_PLATFORM) + +/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_x86.h */ +template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar, + QuantizationFlavor quantization_flavor> +struct GemmImplX86 +{ + static void Run(const MatrixParams<LhsScalar> &, const LhsScalar *, + const MatrixParams<RhsScalar> &, const RhsScalar *, + const MatrixParams<DstScalar> &, DstScalar *, + const GemmParams<AccumScalar, DstScalar, quantization_flavor> &) + { + static_assert( + std::is_floating_point<LhsScalar>::value && std::is_floating_point<RhsScalar>::value && + std::is_floating_point<AccumScalar>::value && std::is_floating_point<DstScalar>::value && + quantization_flavor != QuantizationFlavor::kFloatingPoint, + "GemmImplX86 does not supported types other than float yet."); + } +}; + +// For float, defer to eigen for now. 
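+// The specialization below simply forwards to detail::GemmImplUsingEigen, so on
+// CKER_X86_PLATFORM a float GEMM issued through the Gemm() entry point defined
+// further down ends up in Eigen. Illustrative call, assuming the float
+// MatrixParams/GemmParams have already been filled in by the caller:
+//   optimized::Gemm(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+//                   dst_data, gemm_params);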
+template <> struct GemmImplX86<float, float, float, float, QuantizationFlavor::kFloatingPoint>
+{
+  static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data,
+                  const MatrixParams<float> &rhs_params, const float *rhs_data,
+                  const MatrixParams<float> &dst_params, float *dst_data,
+                  const GemmParams<float, float, QuantizationFlavor::kFloatingPoint> &params)
+  {
+    detail::GemmImplUsingEigen::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+                                    dst_data, params);
+  }
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+/* GEMM dispatch implementation for x86.
+ */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+struct GemmImpl : GemmImplX86<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>
+{
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+void Gemm(const MatrixParams<LhsScalar> &lhs_params, const LhsScalar *lhs_data,
+          const MatrixParams<RhsScalar> &rhs_params, const RhsScalar *rhs_data,
+          const MatrixParams<DstScalar> &dst_params, DstScalar *dst_data,
+          const GemmParams<AccumScalar, DstScalar, quantization_flavor> &params)
+{
+  // Generic case: dispatch to any backend as a general GEMM.
+  GemmImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>::Run(
+    lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, params);
+}
+
+// From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_params.h
+inline CachePolicy DefaultCachePolicy(bool is_constant_data)
+{
+  return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache;
+}
+#endif // CKER_X86_PLATFORM
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_GEMM_H__
diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
index ae1f9e78e..f5edc94ab 100644
--- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
+++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
@@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h,
   {
     const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
     const int bottom_start =
-      output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+      output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
     memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T)));
   }
 }
@@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams &params, const Shape &input_shape, const T *
   for (int batch = 0; batch < batches; ++batch)
   {
     const T zero_byte =
-      zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
+      zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
     for (int out_y = 0; out_y < output_height; ++out_y)
     {
       for (int out_x = 0; out_x < output_width; ++out_x)
diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
new file mode 100644
index 000000000..bd8497920
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd.
All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ + +#include "cker/CpuBackendThreadpool.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/Quantize.h" + +#include <fixedpoint/fixedpoint.h> +#include <public/gemmlowp.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized_integer_ops +{ + +// Category of depthwise convolution output rounding. +enum class DepthwiseConvOutputRounding +{ + kNone = 0, // Invalid: specific method must be specified. + kAwayFromZero, // Original method: exact halves rounded away from zero. + kUpward, // Halves towards +infinity: adds 0.5 before truncate. + // This is where a future kNearestEven would be placed. +}; + +// Category of depthwise convolution depth multiplication. +enum class DepthwiseConvDepthMultiplication +{ + kNoMultiplication = 0, // Depth multiplier = 1. + kUnitInputDepth, // Input depth = 1, output depth = depth multiplier. +}; + +namespace depthwise_conv +{ + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON +template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(filter_ptr); + filter_s8.val[1] = vld1_s8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[0].val[i] = + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. 
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. 
+ for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const int8_t dup3_indices_array[3][8] = { + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + int8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) + { + dup3_indices[i] = vld1_s8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. 
+ int16x8_t filter[3]; + int8x8x3_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + filter_s8.val[2] = vld1_s8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, duplicate 3-fold, add input_offset. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + + int8x8_t input_s8_dup3[3]; + for (int i = 0; i < 3; i++) + { + input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) + { + const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[2]; + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. 
+ for (int j = 0; j < 2; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs. + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1); + local_filter_ptr += 16; + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0); + int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1); + local_input_ptr += 16; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter = vmovl_s8(filter_s8); + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(input_ptr); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1); + int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2); + int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3); + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + int16x8_t filter_2 = vmovl_s8(filter_s8_2); + int16x8_t filter_3 = vmovl_s8(filter_s8_3); + // Handle one output pixel at a time. 
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_s8_{0,1} as usual.
+    // Then we load the 8 last bytes into filter_s8_x (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_s8_x are the same
+    // as the last 4 bytes of filter_s8_1.
+    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+    int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+    int16x8_t filter_x = vmovl_s8(filter_s8_x);
+    // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); + acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); + acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); + acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); + acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); + acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ int16x4_t input_s16 = vdup_n_s16(0); + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0); + input_ptr += input_ptr_increment; + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1); + input_ptr += input_ptr_increment; + input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + if (num_output_pixels <= 0) + { + return; + } + + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle one output pixel at a time until second to the last pixel. Second + // to the last because we read eight input pixels while only processing + // four. + for (; outp < num_output_pixels - 1; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle the last output pixel. + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4); + int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1); + int16x4_t filter_0 = vget_low_s16(filter_s16_0); + int16x4_t filter_1 = vget_high_s16(filter_s16_0); + int16x4_t filter_2 = vget_high_s16(filter_s16_1); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(input_ptr); + int8x8_t input_s8_1 = vld1_s8(input_ptr + 4); + input_ptr += input_ptr_increment; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + + // Multiply-accumulate + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); + acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); + + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + + acc_buffer_ptr += 12; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, int16_t input_offset, + int pad_width, int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32_t *acc_buffer) +{ + // Consistency check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. 
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, + int16_t input_offset, int pad_width, + int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + int32_t *acc_buffer) +{ + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const int8_t *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const int16_t input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) + { + const int16_t filter_val = *filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32_t *bias_data, int32_t *acc_buffer) +{ + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) + { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) + { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + i, b); + } + } + else if (output_depth == 2) + { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) + { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 2 * i, b); + } + } + else if (output_depth == 4) + { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 4 * i, b); + } + } + else if (output_depth == 8) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } + else if (output_depth == 16) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = 
vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape & /* bias_shape */, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, int thread_start, + int thread_end, int thread_dim) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_rows = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + static const int kAccBufferMaxSize = 2048; + int32_t acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + UNUSED_RELEASE(kAccBufferActualSize); + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. 
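+  // Each entry is (AllowStrided, FixedInputDepth, FixedDepthMultiplier); e.g.
+  // (false, 8, 1) is selected only when stride_width == 1, input_depth == 8
+  // and depth_multiplier == 1.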
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) +#endif // USE_NEON + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + } + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_rows; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + assert(thread_start >= 0); + assert(thread_end <= output_rows); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + int8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_rows + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. 
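+          // acc_buffer holds num_output_pixels * output_depth accumulators;
+          // e.g. with output_depth == 8 a chunk covers up to 2048 / 8 == 256
+          // output pixels.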
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32_t values. Now need to convert them to + // the final 8bit form and store them. + const int num_output_values = output_depth * num_output_pixels; + + Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset, + output_activation_min, output_activation_max, acc_buffer, output_ptr); + + output_ptr += num_output_values; + } + } + output_ptr += batch_step; + } +} + +} // namespace depthwise_conv + +template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + int thread_start, int thread_end, int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + +// TODO Use below codes +#if 0 +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#if defined(__ANDROID__) && defined(__clang__) + CpuFlags cpu_flags; + GetCpuFlags(&cpu_flags); + const bool has_dot_product_instructions = cpu_flags.neon_dotprod; + + // Dispatch to dot-product 3x3 kernels when supported. 
+ if (has_dot_product_instructions) + { + using optimized_ops::depthwise_conv::DotProduct3x3KernelType; + DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params, output_shift); + if (kernel_type != DotProduct3x3KernelType::kNone) + { + DepthwiseConvParams params_copy = params; + params_copy.output_shift_per_channel = output_shift; + params_copy.output_multiplier_per_channel = output_multiplier; + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< + DepthwiseConvImplementation::kUseNeon3x3DotProduct>( + params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } + } + +#endif + // Dispatch to non-dot-product 3x3 kernels when supported. + + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + + // Call kernel optimized for depthwise convolutions using 3x3 filters if + // parameters are supported. + if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0, + output_shift)) + { + optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel< + DepthwiseConvOutputRounding::kUpward>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } +#endif + +#endif /* end of if 0 */ + + depthwise_conv::DepthwiseConvGeneral( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + int8_t *output_data, int thread_start, int thread_end, int thread_dim) +{ + return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task +{ + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift), + input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape), + filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data), + output_shape_(output_shape), 
output_data_(output_data), thread_start_(thread_start), + thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_, + filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, + output_data_, thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const int32_t *output_multiplier_; + const int32_t *output_shift_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim) +{ + constexpr int kMinMulPerThread = 8; + const int output_units = output_shape.Dims(thread_dim); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_mul_per_unit = + FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width; + const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1; + int thread_count = output_units / min_units_per_thread; + return thread_count; +} + +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + ruy::Context *ruy_context) +{ + UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context); + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batches = output_shape.Dims(0); + const int output_rows = output_shape.Dims(1); + int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0); + int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1); + int thread_dim, thread_count, thread_dim_size; + if (thread_count_batch > thread_count_row) + { + thread_dim = 0; + thread_dim_size = output_batches; + thread_count = thread_count_batch; + } + else + { + thread_dim = 1; + thread_dim_size = output_rows; + thread_count = thread_count_row; + } + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const int max_threads = ruy_context->max_num_threads(); + thread_count = std::max(1, std::min(thread_count, max_threads)); + + if (thread_count == 1) + { + DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, + /*thread_start=*/0, + /*thread_end=*/output_rows, /*thread_dim=*/1); + } + else + { + std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
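+    // The loop below gives each task an even share of the remaining units
+    // along thread_dim; e.g. 10 rows across 4 tasks split as 2, 2, 3, 3.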
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) + { + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; + } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); + } +} + +} // namespace optimized_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h index e8ffd4014..1b3020de2 100644 --- a/compute/cker/include/cker/operation/reference/BatchMatMul.h +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha { const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - float *out_ptr = - output_data + - ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index f7e39248c..96e1d9127 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -56,28 +56,22 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < size; i++) { - output_data[i] = - ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), - params.float_activation_min, params.float_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max); } } template <typename T> -inline void BroadcastBinaryArithmeticOpSlowQuant8( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, - const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, - const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) +inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow( + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); - if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255)) - { - throw std::runtime_error{"Support only for Quant8."}; - } - // Comment from tensorflow lite: // // In Tensorflow, the dimensions are canonically named (batch_number, row, @@ -99,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( { for (int c = 0; c < 
extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -143,9 +136,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -154,9 +147,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m template <> inline void BroadcastBinaryArithmeticOpSlow( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, - const Shape &input2_shape, const float *input2_data, const Shape &output_shape, - float *output_data, const std::function<float(const float &, const float &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -171,10 +164,10 @@ inline void BroadcastBinaryArithmeticOpSlow( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.float_activation_min, params.float_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 86e8b5143..e316083a5 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -98,8 +98,8 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float bias_value = bias_data[out_channel]; } output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - ActivationFunctionWithMinMax(total + bias_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); } } } @@ -183,7 +183,213 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast<uint8_t>(acc); + 
static_cast<uint8_t>(acc); + } + } + } + } +} + +template <typename T, bool is_asymmetric> +inline void Conv(const ConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const T *filter_data, const int32_t *filter_zeropoint, + const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, + T *output_data) + +{ + UNUSED_RELEASE(bias_shape); + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + assert(output_activation_min < output_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const T input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + const T filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + if (is_asymmetric) + { + const int32_t filter_offset = -filter_zeropoint[out_channel]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + else + { + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. 
This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. + acc += filter_val * (input_val + input_offset); + UNUSED_RELEASE(filter_zeropoint); + } + } + } + } + + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel], + output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast<T>(acc); + } + } + } + } +} + +// Slightly modified from tflite 2.13.0 HybridConvPerChannel +// im2col and im2col_shape are removed since it is not used in reference kernel. +inline void HybridConvPerChannel(const ConvParams ¶ms, float *scaling_factors_ptr, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const float *per_channel_scale, const int32_t *input_offset) + +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(bias_shape); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + assert(input_depth % filter_input_depth == 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + auto group = out_channel / filters_per_group; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - 
pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + acc += filter_val * (input_val - input_offset[batch]); + } + } + } + } + float acc_float = acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch]; + if (bias_data) + { + acc_float += bias_data[out_channel]; + } + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h new file mode 100644 index 000000000..9fc58ad3b --- /dev/null +++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ +#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference_integer_ops +{ + +inline void DepthwiseConvHybridPerChannel(const DepthwiseConvParams ¶ms, + float *scaling_factors_ptr, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, + float *output_data, const float *per_channel_scale, + int32_t *input_offset) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + // Check dimensions of the tensors. 
+ assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int bias_depth = bias_shape.FlatSize(); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_depth == output_depth); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + if (is_point_inside_image) + { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = + filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val - input_offset[batch]); + } + } + } + float acc_float = static_cast<float>(acc); + acc_float *= per_channel_scale[output_channel] * scaling_factors_ptr[batch]; + if (bias_data && output_channel < bias_depth) + { + acc_float += bias_data[output_channel]; + } + output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] = + ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max); + } + } + } + } + } +} + +} // namespace reference_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h new file mode 100644 index 000000000..025e40705 --- /dev/null +++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ +#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference_integer_ops +{ +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, + const int32_t *filter_zeropoint, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data) +{ + // Get parameters. + // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Check dimensions of the tensors. + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + assert(output_activation_min <= output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
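+                // Skipping out-of-range taps is equivalent to padding with the
+                // input zero point, whose contribution is zero once
+                // input_offset is applied.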
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + if (is_point_inside_image) + { + uint8_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + uint8_t filter_val = + filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)]; + + // { for per-channel + // NOTE: The following comment is copied from tflite int8 implementation + // It may not be 100% true for uint8 per-channel. + // + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8, even though it + // is represented using int32_t. + // int32 += int8 * (int8 - int8) so the highest value we can + // get from each accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. + const int32_t filter_offset = -filter_zeropoint[output_channel]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + // } for per-channel + } + } + } + if (bias_data) + { + acc += bias_data[output_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[output_channel], + output_shift[output_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + // For q8u per-channel, int8_t -> uint8_t + output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] = + static_cast<uint8_t>(acc); + } + } + } + } + } +} + +} // namespace reference_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 9612dd517..14489a804 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -19,7 +19,9 @@ #define __NNFW_CKER_RUY_RUY_SUPPORT_H__ #include <util/ConfigSource.h> -#include <ruy/context.h> +#include <ruy/matrix.h> +#include <ruy/ruy.h> +#include <cassert> #include "cker/Types.h" namespace nnfw @@ -29,44 +31,66 @@ namespace cker namespace ruy_support { +inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy) +{ + switch (cache_policy) + { + case CachePolicy::kNeverCache: + return ruy::CachePolicy::kNeverCache; + case CachePolicy::kCacheIfLargeSpeedup: + return ruy::CachePolicy::kCacheIfLargeSpeedup; + case CachePolicy::kAlwaysCache: + return ruy::CachePolicy::kAlwaysCache; + default: + assert(false); + return ruy::CachePolicy::kNeverCache; + } +} + template <typename Scalar, typename DataPointer> void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, - ruy::Matrix<Scalar> *dst) + ruy::Matrix<Scalar> *dst, bool use_caching = false) { - dst->layout.rows = params.rows; - dst->layout.cols = params.cols; - if (params.order == Order::kColMajor) + ruy::Order ruy_order = + params.order == Order::kColMajor ? 
ruy::Order::kColMajor : ruy::Order::kRowMajor; + ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); + // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. + // It does care whether we assign to it a Scalar* or a const Scalar*. + dst->set_data(data_ptr); + dst->set_zero_point(params.zero_point); + if (use_caching) { - dst->layout.order = ruy::Order::kColMajor; - dst->layout.stride = params.rows; + dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy)); } - else +} + +// Integer-quantized case with destination type narrower than int32 +template <typename DstScalar, QuantizationFlavor quantization_flavor> +void MakeRuyMulParams(const GemmParams<std::int32_t, DstScalar, quantization_flavor> ¶ms, + ruy::MulParams<std::int32_t, DstScalar> *ruy_mul_params) +{ + static_assert(sizeof(DstScalar) < sizeof(std::int32_t), ""); + if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier) { - dst->layout.order = ruy::Order::kRowMajor; - dst->layout.stride = params.cols; + ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint); + ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent); } - // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. - // It does care whether we assign to it a Scalar* or a const Scalar*. - dst->data = data_ptr; - dst->zero_point = params.zero_point; - dst->cacheable = params.cacheable; + if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier) + { + ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel); + ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel); + } + ruy_mul_params->set_bias(params.bias); + ruy_mul_params->set_clamp_min(params.clamp_min); + ruy_mul_params->set_clamp_max(params.clamp_max); } -template <typename GemmParamsType, typename RuySpecType> -void MakeRuySpec(const GemmParamsType ¶ms, RuySpecType *ruy_spec) +// Raw-integer case with destination type int32. +template <QuantizationFlavor quantization_flavor> +void MakeRuyMulParams(const GemmParams<std::int32_t, std::int32_t, quantization_flavor> ¶ms, + ruy::MulParams<std::int32_t, std::int32_t> *ruy_mul_params) { - // This validation has already been performed by the Gemm API entry point, - // but it doesn't hurt to test specifically this again here, where it's - // being used. - ValidateGemmParams(params); - - ruy_spec->multiplier_fixedpoint = params.multiplier_fixedpoint; - ruy_spec->multiplier_exponent = params.multiplier_exponent; - ruy_spec->multiplier_fixedpoint_perchannel = params.multiplier_fixedpoint_perchannel; - ruy_spec->multiplier_exponent_perchannel = params.multiplier_exponent_perchannel; - ruy_spec->bias = params.bias; - ruy_spec->clamp_min = params.clamp_min; - ruy_spec->clamp_max = params.clamp_max; + ruy_mul_params->set_bias(params.bias); } } // namespace ruy_support diff --git a/compute/cker/include/cker/train/operation/FullyConnected.h b/compute/cker/include/cker/train/operation/FullyConnected.h new file mode 100644 index 000000000..b0255d287 --- /dev/null +++ b/compute/cker/include/cker/train/operation/FullyConnected.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__ +#define __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__ + +#include "cker/eigen/Utils.h" +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +template <typename T> +inline void FullyConnectedBiasGrad(const Shape &incomming_shape, const T *incomming_data, + const Shape &grad_shape, T *grad_data) +{ + const auto bias_size = grad_shape.FlatSize(); + if (bias_size != incomming_shape.Dims(incomming_shape.DimensionsCount() - 1) || + bias_size != grad_shape.Dims(0)) + throw std::runtime_error("cker::FullyConnectedBiasGrad: Unmatched shape"); + + const auto in_mat = MapAsMatrixWithLastDimAsRows(incomming_data, incomming_shape); + auto grad_mat = MapAsMatrixWithLastDimAsRows(grad_data, grad_shape); + + grad_mat = in_mat.rowwise().sum(); +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_FULLY_CONNECTED_H__ diff --git a/compute/cker/include/cker/train/operation/Loss.h b/compute/cker/include/cker/train/operation/Loss.h new file mode 100644 index 000000000..94f49ff07 --- /dev/null +++ b/compute/cker/include/cker/train/operation/Loss.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ +#define __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +template <typename T> +inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, + const T *y_true_data, const Shape &output_shape, T *output_data) +{ + // TODO Consider Reduction + if (output_shape != Shape{1}) + throw std::runtime_error("cker::MSE: output_shape != Shape{1}"); + if (y_pred_shape != y_true_shape) + throw std::runtime_error("cker::MSE: y_pred_shape != y_true_shape"); + + const auto y_pred = MapAsMatrixWithLastDimAsRows(y_pred_data, y_pred_shape); + const auto y_true = MapAsMatrixWithLastDimAsRows(y_true_data, y_true_shape); + + double squared_sum = 0.0f; + for (size_t c = 0; c < (size_t)y_pred.cols(); ++c) + { + for (size_t r = 0; r < (size_t)y_pred.rows(); ++r) + { + double error = y_pred.coeff(r, c) - y_true.coeff(r, c); + squared_sum += (error * error); + } + } + + auto size = y_pred.cols() * y_pred.rows(); + output_data[0] = (T)(squared_sum / size); +} + +template <typename T> +inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, + const T *y_true_data, const Shape &grad_shape, T *grad_data) +{ + if (y_pred_shape != y_true_shape) + throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape"); + if (y_pred_shape != grad_shape) + throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape"); + + const int size = grad_shape.FlatSize(); + for (int i = 0; i < size; ++i) + { + grad_data[i] = static_cast<T>(-2 * (y_true_data[i] - y_pred_data[i]) / size); + } +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ diff --git a/compute/cker/include/cker/train/operation/ReLU.h b/compute/cker/include/cker/train/operation/ReLU.h new file mode 100644 index 000000000..32cf7fa9c --- /dev/null +++ b/compute/cker/include/cker/train/operation/ReLU.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_RELU_H__ +#define __NNFW_CKER_TRAIN_OPERATION_RELU_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +inline void ReLUGrad(const Shape &output_shape, const float *output_data, + const Shape &incoming_shape, const float *incoming_data, + const Shape &grad_shape, float *grad_data) +{ + const auto output_map = MapAsVector(output_data, output_shape); + const auto incoming_map = MapAsVector(incoming_data, incoming_shape); + auto grad_map = MapAsVector(grad_data, grad_shape); + + if (output_shape == incoming_shape && output_shape == grad_shape) + grad_map.array() = incoming_map.array() * (output_map.array() > 0.0f).template cast<float>(); + else + throw std::runtime_error("cker::ReLUGrad: Unsupported shape"); +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TRAIN_OPERATION_RELU_H__ diff --git a/compute/cker/src/Range.test.cc b/compute/cker/src/Range.test.cc new file mode 100644 index 000000000..e5fe4801f --- /dev/null +++ b/compute/cker/src/Range.test.cc @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/operation/Range.h> + +#include <gtest/gtest.h> +#include <vector> + +TEST(CKer_Operation, Range) +{ + { + const int start = 0; + const int limit = 10; + const int delta = 1; + std::vector<int> actual(10); + nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_EQ(actual[i], i); + } + + { + const int start = 3; + const int limit = 18; + const int delta = 3; + std::vector<int> expected = {3, 6, 9, 12, 15}; + std::vector<int> actual(expected.size()); + nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_EQ(actual[i], expected[i]); + } + + { + const float start = 3; + const float limit = 1; + const float delta = -0.5; + std::vector<float> expected = {3, 2.5, 2, 1.5}; + std::vector<float> actual(expected.size()); + nnfw::cker::Range<float>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_FLOAT_EQ(actual[i], expected[i]); + } +} + +TEST(CKer_Operation, neg_Range) +{ + { + const int start = 212; + const int limit = 10; + const int delta = 1; + std::vector<int> actual(10); + + EXPECT_ANY_THROW(nnfw::cker::Range<int>(&start, &limit, &delta, actual.data())); + } +} diff --git a/compute/cker/src/train/FullyConnected.test.cc b/compute/cker/src/train/FullyConnected.test.cc new file mode 100644 index 000000000..37c2d4a97 --- /dev/null +++ b/compute/cker/src/train/FullyConnected.test.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/train/operation/FullyConnected.h> + +#include <gtest/gtest.h> +#include <vector> + +TEST(CKer_Operation, FullyConnectedBiasGrad) +{ + { + // Shape: {2, 4} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8}; + // Shape: {4} + std::vector<float> expected_bias_backward = {4, -4, -10, 12}; + std::vector<float> bias_backward(4); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{2, 4}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } + + { + // Shape: {3, 3} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9}; + // Shape: {3} + std::vector<float> expected_bias_backward = {-4, 15, 0}; + std::vector<float> bias_backward(3); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{3, 3}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } + + { + // Shape: {1, 2, 2, 3} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9, -10, -11, 12}; + // Shape: {3} + std::vector<float> expected_bias_backward = {-14, 4, 12}; + std::vector<float> bias_backward(3); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{1, 2, 2, 3}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } +} + +TEST(CKer_Operation, neg_FullyConnectedBiasGrad) +{ + { + // Unmatched shape + // Shape: {2, 4} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8}; + // Shape: {3} + std::vector<float> bias_backward(3); + EXPECT_ANY_THROW(nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{2, 4}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, + bias_backward.data());); + } +} diff --git a/compute/cker/src/train/Loss.test.cc b/compute/cker/src/train/Loss.test.cc new file mode 100644 index 000000000..98568f47a --- /dev/null +++ b/compute/cker/src/train/Loss.test.cc @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cker/train/operation/Loss.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, LossMSE)
+{
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    std::vector<int> output(1);
+    std::vector<int> expected = {1};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {1.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {2, 3} -> m_rows:3, m_cols:2
+    std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+    std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+    std::vector<float> output(1);
+    std::vector<float> expected = {110.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {2, 3, 4} -> m_rows:4, m_cols:6
+    std::vector<float> y_pred = {1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.,
+                                 1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.};
+    std::vector<float> y_true = {1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.,
+                                 1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {2.1666667};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(), nnfw::cker::Shape{2, 3, 4},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+}
+
+TEST(CKer_Operation, neg_LossMSE)
+{
+  {
+    // Invalid expected value
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {-1.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_NE(output[0], expected[0]);
+  }
+
+  {
+    // Invalid output shape
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(3);
+    std::vector<float> expected = {1.0};
+
+    EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+                                            nnfw::cker::Shape{2, 3, 4}, y_true.data(),
+                                            nnfw::cker::Shape{3}, output.data()));
+  }
+
+  {
+    // Different y_pred and y_true shape
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {1.0};
+
+    EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+                                            nnfw::cker::Shape{2, 3}, y_true.data(),
+                                            nnfw::cker::Shape{1}, output.data()));
+  }
+}
+
+TEST(CKer_Operation, LossMSEGrad)
+{
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector<int> deriv_y_pred(10); + std::vector<int> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10}, + y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_EQ(deriv_y_pred[i], expected[i]); + } + + { + // Shape: {1, 10} -> m_rows:10, m_cols:1 + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector<float> deriv_y_pred(10); + std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10}, + y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]); + } + + { + // Shape: {2, 3} -> m_rows:3, m_cols:2 + std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4}; + std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9}; + std::vector<float> deriv_y_pred(6); + std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3}, + y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]); + } +} + +TEST(CKer_Operation, neg_LossMSEGrad) +{ + { + // Invalid expected value + std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4}; + std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9}; + std::vector<float> deriv_y_pred(6); + std::vector<float> expected = {1., 1., 1., 1., 1., 1.}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3}, + y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_NE(deriv_y_pred[i], expected[i]); + } + + { + // Different y_pred and y_true shape + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5.}; + std::vector<float> deriv_y_pred(10); + + EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), + nnfw::cker::Shape{2, 3}, y_true.data(), + nnfw::cker::Shape{1, 10}, deriv_y_pred.data())); + } + + { + // Different y_pred and deriv_y_pred shape + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector<float> deriv_y_pred(6); + + EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), + nnfw::cker::Shape{1, 10}, y_true.data(), + nnfw::cker::Shape{2, 3}, deriv_y_pred.data())); + } +} diff --git a/compute/cker/src/train/Relu.test.cc b/compute/cker/src/train/Relu.test.cc new file mode 100644 index 000000000..d94411038 --- /dev/null +++ b/compute/cker/src/train/Relu.test.cc @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/operation/ReLU.h> +#include <cker/train/operation/ReLU.h> + +#include <gtest/gtest.h> +#include <vector> + +namespace +{ + +template <typename T> class ReluOpVerifier +{ +public: + ReluOpVerifier(const std::vector<T> &input, const std::vector<T> &expected_output, + const std::vector<T> &backprop_output, + const std::vector<T> &expected_backprop_input) + : _input{input}, _expected_output{expected_output}, _backprop_output{backprop_output}, + _expected_backprop_input{expected_backprop_input} + { + EXPECT_TRUE(input.size() == expected_output.size()); + _output.resize(_expected_output.size()); + _backprop_input.resize(_expected_backprop_input.size()); + } + +public: + void verifyExpected() + { + nnfw::cker::ReLU(nnfw::cker::Shape{static_cast<int>(_input.size())}, _input.data(), + nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data()); + + for (size_t i = 0; i < _output.size(); ++i) + ASSERT_EQ(_output[i], _expected_output[i]); + + if (_backprop_output.size() > 0) + { + nnfw::cker::train::ReLUGrad( + nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data(), + nnfw::cker::Shape{static_cast<int>(_backprop_output.size())}, _backprop_output.data(), + nnfw::cker::Shape{static_cast<int>(_backprop_input.size())}, _backprop_input.data()); + + for (size_t i = 0; i < _backprop_input.size(); ++i) + ASSERT_EQ(_backprop_input[i], _expected_backprop_input[i]); + } + } + +private: + std::vector<T> _input; + std::vector<T> _output; + std::vector<T> _expected_output; + std::vector<T> _backprop_output; + std::vector<T> _backprop_input; + std::vector<T> _expected_backprop_input; +}; + +} // namespace + +TEST(CKer_Operation, ReLU) +{ + { + std::vector<float> input_forward = {-1, 2, 3, -4}; + std::vector<float> expected_forward = {0, 2, 3, 0}; + std::vector<float> incoming_backward = {-5, 6, -7, 8}; + std::vector<float> expected_backward = {0, 6, -7, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + verifier.verifyExpected(); + } + + { + std::vector<float> input_forward = {0, -1, 2, 3, -4, 5, 6, -7}; + std::vector<float> expected_forward = {0, 0, 2, 3, 0, 5, 6, 0}; + std::vector<float> incoming_backward = {8, -9, 10, 11, -12, -13, 14, -15}; + std::vector<float> expected_backward = {0, 0, 10, 11, 0, -13, 14, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + verifier.verifyExpected(); + } +} + +TEST(CKer_Operation, neg_ReLU) +{ + { + // Unmatched shape + std::vector<float> input_forward = {0, -1, 2, 3, -4}; + std::vector<float> expected_forward = {0, 0, 2, 3, 0}; + std::vector<float> incoming_backward = {-5, 6, -7, 8}; + std::vector<float> expected_backward = {0, 6, -7, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + EXPECT_ANY_THROW(verifier.verifyExpected()); + } +} |
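
Note (not part of the patch): the unit tests above exercise each new training kernel in isolation. As a quick orientation, the following minimal sketch shows how the kernels introduced in this diff could be chained for a single FullyConnected -> ReLU -> MSE training step. It assumes only the headers and function signatures visible in this diff; the shapes and values are made up for illustration.

// Hypothetical usage sketch; not part of the diff above.
#include <cker/operation/ReLU.h>
#include <cker/train/operation/FullyConnected.h>
#include <cker/train/operation/Loss.h>
#include <cker/train/operation/ReLU.h>

#include <iostream>
#include <vector>

int main()
{
  // Forward: ReLU over a {2, 3} activation tensor (batch of 2, 3 units).
  std::vector<float> logits = {-1.f, 2.f, -3.f, 4.f, 5.f, -6.f};
  std::vector<float> activations(logits.size());
  nnfw::cker::ReLU(nnfw::cker::Shape{2, 3}, logits.data(), nnfw::cker::Shape{2, 3},
                   activations.data());

  // Loss: MSE reduces to a single scalar, so the output shape must be {1}.
  std::vector<float> targets = {0.f, 2.f, 0.f, 3.f, 5.f, 0.f};
  std::vector<float> loss(1);
  nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3}, activations.data(), nnfw::cker::Shape{2, 3},
                         targets.data(), nnfw::cker::Shape{1}, loss.data());

  // Backward: dLoss/dActivation, then back through the ReLU.
  std::vector<float> grad_act(activations.size());
  nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, activations.data(),
                             nnfw::cker::Shape{2, 3}, targets.data(), nnfw::cker::Shape{2, 3},
                             grad_act.data());

  std::vector<float> grad_logits(logits.size());
  nnfw::cker::train::ReLUGrad(nnfw::cker::Shape{2, 3}, activations.data(),
                              nnfw::cker::Shape{2, 3}, grad_act.data(), nnfw::cker::Shape{2, 3},
                              grad_logits.data());

  // Bias gradient of the fully connected layer feeding the ReLU: sum the
  // incoming gradient over the batch dimension, one value per output unit.
  std::vector<float> grad_bias(3);
  nnfw::cker::train::FullyConnectedBiasGrad(nnfw::cker::Shape{2, 3}, grad_logits.data(),
                                            nnfw::cker::Shape{3}, grad_bias.data());

  std::cout << "loss = " << loss[0] << "\n";
  return 0;
}

In this sketch the scalar loss has shape {1}, while MSEGrad, ReLUGrad and FullyConnectedBiasGrad produce gradients shaped like their corresponding inputs, which matches the shape checks enforced inside the kernels.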