path: root/compute
author    Chunseok Lee <chunseok.lee@samsung.com>  2020-12-14 14:43:04 +0900
committer Chunseok Lee <chunseok.lee@samsung.com>  2020-12-14 14:43:04 +0900
commit    12d88feea8573f8490629cf62fc342b152e57d65 (patch)
tree      3c734cc4d629834d2d523f4575ef84cd64684e57 /compute
parent    d6b371e095d737922187a518b8faba1ef6f3a2b1 (diff)
download  nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.gz
          nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.bz2
          nnfw-12d88feea8573f8490629cf62fc342b152e57d65.zip
Imported Upstream version 1.11.0 (upstream/1.11.0)
Diffstat (limited to 'compute')
-rw-r--r--  compute/ARMComputeEx/CMakeLists.txt  36
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h  268
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h  115
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h  86
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h  81
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h  136
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h  132
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h  152
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h  116
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h  106
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h  79
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h  152
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h  101
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h  127
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h  102
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h  680
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h  68
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h  86
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h  94
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h  95
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h  134
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h  112
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h  131
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h  99
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h  137
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/TypesEx.h  79
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/UtilsEx.h  71
-rw-r--r--  compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h  275
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h  34
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h  108
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h  65
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h  71
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h  201
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h  77
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h  186
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h  235
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h  97
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h  85
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h  82
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h  96
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h  63
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h  120
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h  120
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h  86
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h  164
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h  150
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h  31
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h  114
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h  76
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h  88
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h  180
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h  164
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h  84
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h  100
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h  116
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h  90
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h  102
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h  98
-rw-r--r--  compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h  172
-rwxr-xr-x  compute/ARMComputeEx/resolve_includes.py  116
-rw-r--r--  compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp  369
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl  565
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl  130
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl  97
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl  137
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl  163
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl  354
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl  141
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h  571
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h  578
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl  267
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl  122
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl  79
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl  222
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl  159
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl  136
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl  212
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl  108
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl  121
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl  152
-rw-r--r--  compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl  292
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp  329
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp  197
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp  132
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp  139
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp  162
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp  203
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp  193
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp  174
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp  113
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp  185
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp  173
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp  204
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp  155
-rw-r--r--  compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp  497
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp  167
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp  253
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp  343
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp  134
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp  272
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp  197
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp  296
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp  223
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp  229
-rw-r--r--  compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp  240
-rw-r--r--  compute/ARMComputeEx/src/core/UtilsEx.cpp  69
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp  20
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp  221
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp  63
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp  52
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp  267
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp  53
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp  339
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp  583
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp  102
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp  60
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp  53
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp  64
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp  52
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp  75
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp  179
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp  196
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp  335
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp  160
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp  20
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp  86
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp  57
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp  53
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp  300
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp  494
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp  96
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp  63
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp  61
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp  113
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp  59
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp  182
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp  181
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp  242
-rw-r--r--  compute/ARMComputeEx/src/runtime/topk_v2.h  191
-rw-r--r--  compute/CMakeLists.txt  1
-rw-r--r--  compute/cker/CMakeLists.txt  19
-rw-r--r--  compute/cker/README.md  7
-rw-r--r--  compute/cker/include/cker/NeonTensorUtils.h  977
-rw-r--r--  compute/cker/include/cker/PortableTensorUtils.h  244
-rw-r--r--  compute/cker/include/cker/Shape.h  353
-rw-r--r--  compute/cker/include/cker/TensorUtils.h  165
-rw-r--r--  compute/cker/include/cker/Types.h  558
-rw-r--r--  compute/cker/include/cker/Utils.h  451
-rw-r--r--  compute/cker/include/cker/eigen/EigenSupport.h  122
-rw-r--r--  compute/cker/include/cker/eigen/Utils.h  71
-rw-r--r--  compute/cker/include/cker/eigen/eigen_convolution_helpers.h  90
-rw-r--r--  compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h  1783
-rw-r--r--  compute/cker/include/cker/eigen/eigen_spatial_convolutions.h  29
-rw-r--r--  compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h  164
-rw-r--r--  compute/cker/include/cker/gemmlowp/GEMMSupport.h  67
-rw-r--r--  compute/cker/include/cker/neon/neon_check.h  51
-rw-r--r--  compute/cker/include/cker/operation/AddN.h  46
-rw-r--r--  compute/cker/include/cker/operation/ArgMinMax.h  78
-rw-r--r--  compute/cker/include/cker/operation/AveragePool.h  401
-rw-r--r--  compute/cker/include/cker/operation/BatchMatMul.h  146
-rw-r--r--  compute/cker/include/cker/operation/BatchToSpaceND.h  133
-rw-r--r--  compute/cker/include/cker/operation/BinaryArithmeticOps.h  335
-rw-r--r--  compute/cker/include/cker/operation/BroadcastTo.h  182
-rw-r--r--  compute/cker/include/cker/operation/Common.h  103
-rw-r--r--  compute/cker/include/cker/operation/Comparison.h  240
-rw-r--r--  compute/cker/include/cker/operation/Concatenation.h  157
-rw-r--r--  compute/cker/include/cker/operation/Conv.h  187
-rw-r--r--  compute/cker/include/cker/operation/DepthwiseConv.h  167
-rw-r--r--  compute/cker/include/cker/operation/Dequantize.h  118
-rw-r--r--  compute/cker/include/cker/operation/Einsum.h  934
-rw-r--r--  compute/cker/include/cker/operation/Elementwise.h  104
-rw-r--r--  compute/cker/include/cker/operation/Erf.h  42
-rw-r--r--  compute/cker/include/cker/operation/Exp.h  43
-rw-r--r--  compute/cker/include/cker/operation/Fill.h  54
-rw-r--r--  compute/cker/include/cker/operation/FullyConnected.h  266
-rw-r--r--  compute/cker/include/cker/operation/FullyConnectedDense16x1.h  134
-rw-r--r--  compute/cker/include/cker/operation/FullyConnectedSparse16x1.h  159
-rw-r--r--  compute/cker/include/cker/operation/FusedBatchNorm.h  162
-rw-r--r--  compute/cker/include/cker/operation/Gather.h  73
-rw-r--r--  compute/cker/include/cker/operation/Helper/BCast.h  471
-rw-r--r--  compute/cker/include/cker/operation/Helper/MatmulBCast.h  95
-rw-r--r--  compute/cker/include/cker/operation/Helper/PhiloxRandom.h  276
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomDistributions.h  778
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomOp.h  52
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomOpCpu.h  163
-rw-r--r--  compute/cker/include/cker/operation/Helper/Tensor.h  183
-rw-r--r--  compute/cker/include/cker/operation/InstanceNorm.h  92
-rw-r--r--  compute/cker/include/cker/operation/L2Normalize.h  94
-rw-r--r--  compute/cker/include/cker/operation/LSTM.h  371
-rw-r--r--  compute/cker/include/cker/operation/LogSoftMax.h  139
-rw-r--r--  compute/cker/include/cker/operation/LogicalNot.h  41
-rw-r--r--  compute/cker/include/cker/operation/LogicalOr.h  80
-rw-r--r--  compute/cker/include/cker/operation/Logistic.h  43
-rw-r--r--  compute/cker/include/cker/operation/MatrixBandPart.h  77
-rw-r--r--  compute/cker/include/cker/operation/MaxMin.h  104
-rw-r--r--  compute/cker/include/cker/operation/MaxPool.h  211
-rw-r--r--  compute/cker/include/cker/operation/OneHot.h  67
-rw-r--r--  compute/cker/include/cker/operation/Pack.h  62
-rw-r--r--  compute/cker/include/cker/operation/Pad.h  225
-rw-r--r--  compute/cker/include/cker/operation/Pow.h  44
-rw-r--r--  compute/cker/include/cker/operation/Quantize.h  47
-rw-r--r--  compute/cker/include/cker/operation/Range.h  63
-rw-r--r--  compute/cker/include/cker/operation/ReLU.h  43
-rw-r--r--  compute/cker/include/cker/operation/ReLU6.h  56
-rw-r--r--  compute/cker/include/cker/operation/Reduce.h  364
-rw-r--r--  compute/cker/include/cker/operation/ReduceMean.h  237
-rw-r--r--  compute/cker/include/cker/operation/ResizeBilinear.h  270
-rw-r--r--  compute/cker/include/cker/operation/Reverse.h  62
-rw-r--r--  compute/cker/include/cker/operation/Round.h  61
-rw-r--r--  compute/cker/include/cker/operation/Select.h  114
-rw-r--r--  compute/cker/include/cker/operation/Slice.h  82
-rw-r--r--  compute/cker/include/cker/operation/SoftMax.h  215
-rw-r--r--  compute/cker/include/cker/operation/SpaceToBatchND.h  95
-rw-r--r--  compute/cker/include/cker/operation/SpaceToDepth.h  71
-rw-r--r--  compute/cker/include/cker/operation/Split.h  65
-rw-r--r--  compute/cker/include/cker/operation/SplitV.h  81
-rw-r--r--  compute/cker/include/cker/operation/SqDiff.h  97
-rw-r--r--  compute/cker/include/cker/operation/StatelessRandomUniform.h  103
-rw-r--r--  compute/cker/include/cker/operation/StridedSlice.h  337
-rw-r--r--  compute/cker/include/cker/operation/Tanh.h  42
-rw-r--r--  compute/cker/include/cker/operation/Tile.h  73
-rw-r--r--  compute/cker/include/cker/operation/Transpose.h  580
-rw-r--r--  compute/cker/include/cker/operation/TransposeConv.h  111
-rw-r--r--  compute/cker/include/cker/operation/Unpack.h  63
-rw-r--r--  compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h  802
-rw-r--r--  compute/cker/include/cker/operation/optimized/Conv.h  290
-rw-r--r--  compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h  2123
-rw-r--r--  compute/cker/include/cker/operation/optimized/OptimizedUtils.h  263
-rw-r--r--  compute/cker/include/cker/operation/reference/BatchMatMul.h  115
-rw-r--r--  compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h  188
-rw-r--r--  compute/cker/include/cker/operation/reference/Conv.h  197
-rw-r--r--  compute/cker/include/cker/ruy/RuySupport.h  88
-rw-r--r--  compute/test/CMakeLists.txt  17
-rw-r--r--  compute/test/cker/Range.cc  72
236 files changed, 43514 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt
new file mode 100644
index 000000000..58f558db2
--- /dev/null
+++ b/compute/ARMComputeEx/CMakeLists.txt
@@ -0,0 +1,36 @@
+nnfw_find_package(ARMCompute QUIET)
+
+if(NOT ARMCompute_FOUND)
+ message(STATUS "Check ARM Compute library extension build: need ARM Compute library")
+ return()
+else(NOT ARMCompute_FOUND)
+ message(STATUS "Check ARM Compute library extension build: OK")
+endif(NOT ARMCompute_FOUND)
+
+set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR})
+
+file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
+
+# generate embedded cl_kernel
+execute_process (
+ WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+ COMMAND bash -c "python resolve_includes.py"
+)
+
+add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
+target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE})
+target_link_libraries(arm_compute_ex PRIVATE arm_compute)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_common)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage)
+# Defines to enable validate check in debug build
+target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS
+ $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED
+ ARM_COMPUTE_LOGGING_ENABLED>)
+# Validate check functions are not used in release builds
+# Some parameters are only used in validate check calls, so they may be unused in release builds
+# Because clang requires "-Wno-unused-parameter -Wno-unused-function" to come after "-Wall",
+# these options are applied after linking nnfw_common, via an interface library
+add_library(ignore_unused_warning INTERFACE)
+target_compile_options(ignore_unused_warning INTERFACE -Wno-unused-parameter -Wno-unused-function)
+target_link_libraries(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:ignore_unused_warning>)
+install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
new file mode 100644
index 000000000..d29886a9d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLKernelLibraryEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
+ * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
+ */
+
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/**
+ * @brief Class to build OpenCL kernels added from nnfw
+ * */
+class CLKernelLibraryEx
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /**
+ * @brief Construct a new CLKernelLibraryEx object
+ */
+ CLKernelLibraryEx();
+
+public:
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
+ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
+ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
+
+ /**
+ * @brief Get the KernelLibrary singleton.
+ * @return The KernelLibrary instance
+ */
+ static CLKernelLibraryEx &get();
+
+ /**
+ * @brief Initialise the kernel library.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @param[in] context CL context used to create programs.
+ * @param[in] device CL device for which the programs are created.
+ * @return N/A
+ */
+ void init(std::string kernel_path, cl::Context context, cl::Device device)
+ {
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
+ }
+
+ /**
+ * @brief Set the path that the kernels reside in.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @return N/A
+ */
+ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
+
+ /**
+ * @brief Get the path that the kernels reside in.
+ * @return the path of kernel files
+ */
+ std::string get_kernel_path() { return _kernel_path; };
+
+ /**
+ * @brief Get the source of the selected program.
+ * @param[in] program_name Program name.
+ * @return Source of the selected program.
+ */
+ std::string get_program_source(const std::string &program_name);
+
+ /**
+ * @brief Set the CL context used to create programs.
+ * @note Setting the context also resets the device to the
+ * first one available in the new context.
+ * @param[in] context A CL context.
+ * @return N/A
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+ if (_context.get() == nullptr)
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+ if (cl_devices.empty())
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ _device = cl_devices[0];
+ }
+ }
+ }
+
+ /**
+ * @brief Return associated CL context.
+ * @return A CL context.
+ */
+ cl::Context &context() { return _context; }
+
+ /**
+ * @brief Set the CL device for which the programs are created.
+ * @param[in] device A CL device.
+ * @return N/A
+ */
+ void set_device(cl::Device device) { _device = std::move(device); }
+
+ /**
+ * @brief Gets the CL device for which the programs are created.
+ * @return A CL device.
+ */
+ cl::Device &get_device() { return _device; }
+
+ /**
+ * @brief Return the device version
+ * @return The content of CL_DEVICE_VERSION
+ */
+ std::string get_device_version();
+
+ /**
+ * @brief Create a kernel from the kernel library.
+ * @param[in] kernel_name Kernel name.
+ * @param[in] build_options_set Kernel build options as a set.
+ * @return The created kernel.
+ */
+ Kernel create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set = {}) const;
+
+ /**
+ * @brief Find the maximum number of local work items in a workgroup that can be supported for
+ * the kernel.
+ * @param[in] kernel kernel object
+ */
+
+ size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+ /**
+ * @brief Return the default NDRange for the device.
+ * @return default NDRange of the device
+ */
+ cl::NDRange default_ndrange() const;
+
+ /**
+ * @brief Clear the library's cache of binary programs
+ * @return N/A
+ */
+ void clear_programs_cache()
+ {
+ _programs_map.clear();
+ _built_programs_map.clear();
+ }
+
+ /**
+ * @brief Access the cache of built OpenCL programs
+ * @return program map data structure in which the key is the program name and the value is
+ * the built cl::Program object
+ */
+ const std::map<std::string, cl::Program> &get_built_programs() const
+ {
+ return _built_programs_map;
+ }
+
+ /**
+ * @brief Add a new built program to the cache
+ * @param[in] built_program_name Name of the program
+ * @param[in] program Built program to add to the cache
+ * @return N/A
+ */
+ void add_built_program(const std::string &built_program_name, cl::Program program);
+
+ /**
+ * @brief Returns true if FP16 is supported by the CL device
+ * @return true if the CL device supports FP16
+ */
+ bool fp16_supported() const;
+
+ /**
+ * @brief Returns true if int64_base_atomics extension is supported by the CL device
+ * @return true if the CL device supports int64_base_atomics extension
+ */
+ bool int64_base_atomics_supported() const;
+
+private:
+ /**
+ * @brief Load program and its dependencies.
+ * @param[in] program_name Name of the program to load.
+ */
+ const Program &load_program(const std::string &program_name) const;
+ /**
+ * @brief Concatenates contents of a set into a single string.
+ * @param[in] s Input set to concatenate.
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< Underlying CL device. */
+ std::string _kernel_path; /**< Path to the kernels folder. */
+ mutable std::map<std::string, const Program>
+ _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, cl::Program>
+ _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string>
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string>
+ _program_source_map; /**< Contains sources for all programs.
+ Used for compile-time kernel inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
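[Editor's note] For orientation, a minimal usage sketch of the interface above (illustrative only, not part of the patch). It assumes a valid cl::Context and cl::Device are already available and that "embedding_lookup" is among the kernel names registered by the accompanying CLKernelLibrary.cpp:

    #include <set>
    #include <string>

    #include "arm_compute/core/CL/CLKernelLibraryEx.h"

    // Builds one of the extension kernels through the singleton.
    void build_extension_kernel(cl::Context context, cl::Device device)
    {
      auto &lib = arm_compute::CLKernelLibraryEx::get();
      // The kernel path is only used when sources are loaded from disk;
      // with EMBEDDED_KERNELS defined it is effectively ignored.
      lib.init("./cl_kernels/", context, device);

      std::set<std::string> build_opts{"-DDATA_TYPE=float"}; // illustrative option only
      auto kernel = lib.create_kernel("embedding_lookup", build_opts);
      (void)kernel;
    }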
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
new file mode 100644
index 000000000..a0aa0560b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ *
+ * @note The default data type for an uninitialized output tensor is
+ * signed 32-bit integer (S32). It is the user's responsibility to check
+ * that the results do not overflow because the indices are computed
+ * in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerKernelEx : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArgMinMaxLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLArgMinMaxLayerKernelEx() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: S32/F16/F32.
+ * @param[in] prev_output Destination tensor of the previous iterations of @ref
+ * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32
+ * Has to be nullptr for the first iteration
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output,
+ unsigned int axis, ReductionOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxLayerKernelEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: S32/F16/F32.
+ * @param[in] prev_output Destination tensor info of the previous iterations. Data types
+ * supported: U32/S32
+ * Has to be nullptr for the first iteration
+ * @param[in] output Destination tensor info. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_prev_output;
+ ICLTensor *_output;
+ unsigned int _reduction_axis;
+ ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */
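[Editor's note] A configuration sketch for the kernel above (illustrative only, not part of the patch). CLTensor, Status, and ReductionOperation::ARG_IDX_MAX come from the stock ACL headers and are assumed available; CL scheduler/context setup is assumed to happen elsewhere:

    #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    // Configures an ARG_IDX_MAX reduction over axis 0. Assumes `input` and
    // `output` were initialised with compatible TensorInfo.
    void configure_argmax(arm_compute::CLTensor &input, arm_compute::CLTensor &output)
    {
      using arm_compute::ReductionOperation;

      // First iteration of the reduction: prev_output must be nullptr.
      const arm_compute::Status status = arm_compute::CLArgMinMaxLayerKernelEx::validate(
          input.info(), nullptr, output.info(), /*axis=*/0, ReductionOperation::ARG_IDX_MAX);
      (void)status; // inspect status.error_description() on failure in real code

      arm_compute::CLArgMinMaxLayerKernelEx kernel;
      kernel.configure(&input, nullptr, &output, /*axis=*/0, ReductionOperation::ARG_IDX_MAX);
    }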
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
new file mode 100644
index 000000000..bb6fcb8f5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/
+class CLBinaryLogicalOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBinaryLogicalOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
new file mode 100644
index 000000000..ed668fd9c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLCastBoolKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLCastBoolKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class for the kernel converting boolean type
+ */
+class CLCastBoolKernel : public ICLSimple3DKernel
+{
+public:
+ /**
+ * @brief Initialise the kernel's input and output.
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLCastBoolKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: U8.
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */
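[Editor's note] Usage sketch for the boolean-cast kernel (not part of the patch), assuming CLTensor from the stock ACL runtime and tensors already initialised with the intended data types:

    #include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    // Casts a boolean (U8) tensor `src` into `dst`, whose TensorInfo already
    // carries the destination data type (e.g. F32).
    void configure_cast_bool(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
    {
      const arm_compute::Status status =
          arm_compute::CLCastBoolKernel::validate(src.info(), dst.info());
      (void)status; // check before configuring in real code

      arm_compute::CLCastBoolKernel kernel;
      kernel.configure(&src, &dst);
    }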
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
new file mode 100644
index 000000000..a614d5259
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLEmbeddingLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLEmbeddingLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform EmbeddingLookup operation with opencl kernel
+*/
+class CLEmbeddingLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object
+ * */
+ CLEmbeddingLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLEmbeddingLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] input Source tensor.
+ * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] lookups Lookups is a 1D tensor whose values are indices into the first
+ * dimension of input.
+ * Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLEmbeddingLookupKernel
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output The output tensor info. Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /** Source tensor */
+ ICLTensor *_output; /** Destination tensor */
+ const ICLTensor *_lookups; /** Lookups tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */
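[Editor's note] Usage sketch (not part of the patch); tensor allocation and CL context setup are assumed to happen elsewhere:

    #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    // Gathers rows of `table` selected by the S32 indices in `lookups` into `output`.
    void configure_embedding_lookup(arm_compute::CLTensor &table, arm_compute::CLTensor &lookups,
                                    arm_compute::CLTensor &output)
    {
      const arm_compute::Status status = arm_compute::CLEmbeddingLookupKernel::validate(
          table.info(), output.info(), lookups.info());
      (void)status; // check before configuring in real code

      arm_compute::CLEmbeddingLookupKernel kernel;
      kernel.configure(&table, &output, &lookups);
    }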
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
new file mode 100644
index 000000000..6630c7be7
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLGatherExKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLGatherExKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
+#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define an interface for the gather kernel.
+ */
+class CLGatherExKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct CLGatherExKernel object
+ * */
+ CLGatherExKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
+ CLGatherExKernel(const CLGatherExKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
+ CLGatherExKernel &operator=(const CLGatherExKernel &) = delete;
+
+ /**
+ * @brief Construct CLGatherExKernel object by using default move constructor
+ * @param[in] CLGatherExKernel object to move
+ */
+ CLGatherExKernel(CLGatherExKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLGatherExKernel object to move
+ */
+ CLGatherExKernel &operator=(CLGatherExKernel &&) = default;
+
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices Indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative
+ * values wrap around. Defaults to 0
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLGatherExKernel
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices Indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor info. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative
+ * values wrap around. Defaults to 0
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis = 0);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_indices;
+ ICLTensor *_output;
+ int _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */
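[Editor's note] Usage sketch (not part of the patch), under the same assumptions as the earlier sketches (initialised CLTensor objects and a configured CL context):

    #include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    // Gathers elements of `input` along `axis` using the S32 `indices` tensor.
    void configure_gather(arm_compute::CLTensor &input, arm_compute::CLTensor &indices,
                          arm_compute::CLTensor &output, int axis)
    {
      const arm_compute::Status status = arm_compute::CLGatherExKernel::validate(
          input.info(), indices.info(), output.info(), axis);
      (void)status; // check before configuring in real code

      arm_compute::CLGatherExKernel kernel;
      kernel.configure(&input, &indices, &output, axis);
    }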
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
new file mode 100644
index 000000000..99cfa61ec
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with opencl kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLHashtableLookupKernel object
+ * */
+ CLHashtableLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLHashtableLookupKernel object to move
+ * */
+ CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLHashtableLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLHashtableLookupKernel
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output The output tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+ * hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_lookups{nullptr}; /**< Lookups tensor */
+ const ICLTensor *_keys{nullptr}; /**< Keys tensor */
+ const ICLTensor *_input{nullptr}; /**< Source tensor */
+ ICLTensor *_output{nullptr}; /**< Destination tensor */
+ ICLTensor *_hits{nullptr}; /**< Hits tensor */
+ std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
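For orientation, a minimal usage sketch of CLHashtableLookupKernel based on the signatures documented above; the helper name, tensor shapes and the CLTensor/CLScheduler boilerplate are illustrative assumptions, not taken from this header.

```cpp
#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void run_hashtable_lookup_example() // hypothetical helper
{
  CLScheduler::get().default_init(); // create the default CL context/queue

  // Hypothetical shapes: a table of 10 rows x 4 columns, 3 lookups.
  CLTensor lookups, keys, input, output, hits;
  lookups.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
  keys.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));
  input.allocator()->init(TensorInfo(TensorShape(4U, 10U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
  hits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U8));

  CLHashtableLookupKernel kernel;
  kernel.configure(&lookups, &keys, &input, &output, &hits);

  for (auto *t : {&lookups, &keys, &input, &output, &hits})
    t->allocator()->allocate();

  // Fill lookups/keys/input here, then enqueue and wait for completion.
  CLScheduler::get().enqueue(kernel);
  CLScheduler::get().sync();
}
```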
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..f57e799ad
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernelEx : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLInstanceNormalizationLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayerKernelEx &
+ operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
+ /** Default Move Constructor. */
+ CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default move assignment operator */
+ CLInstanceNormalizationLayerKernelEx &
+ operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLInstanceNormalizationLayerKernelEx() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+ ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLInstanceNormalizationLayerEx.
+ *
+ * @param[in] input Source tensor info. If @p output is nullptr, this tensor will
+ * store the result of the normalization (in-place computation).
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ ICLTensor *_gamma;
+ ICLTensor *_beta;
+ float _epsilon;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
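A hedged sketch of how CLInstanceNormalizationLayerKernelEx::validate() and configure() might be driven; the function name, the early-return policy and the assumption of already-initialized, allocated F16/F32 NCHW tensors are illustrative.

```cpp
#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Tensors are assumed to be already initialized and allocated; gamma/beta may be nullptr.
void run_instance_norm_example(CLTensor &input, CLTensor &output, CLTensor *gamma, CLTensor *beta)
{
  // Check the configuration before creating the kernel.
  const Status s = CLInstanceNormalizationLayerKernelEx::validate(
    input.info(), output.info(), gamma ? gamma->info() : nullptr,
    beta ? beta->info() : nullptr, 1e-12f);
  if (s.error_code() != ErrorCode::OK)
    return; // unsupported shape/data type combination

  CLInstanceNormalizationLayerKernelEx kernel;
  kernel.configure(&input, &output, gamma, beta, 1e-12f);
  CLScheduler::get().enqueue(kernel);
  CLScheduler::get().sync();
}
```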
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
new file mode 100644
index 000000000..90e8b5705
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
+#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to multiply scale factor kernel. */
+class CLMultiplyScaleFactorKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMultiplyScaleFactorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMultiplyScaleFactorKernel(const CLMultiplyScaleFactorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMultiplyScaleFactorKernel &operator=(const CLMultiplyScaleFactorKernel &) = delete;
+ /** Default Move Constructor. */
+ CLMultiplyScaleFactorKernel(CLMultiplyScaleFactorKernel &&) = default;
+ /** Default move assignment operator */
+ CLMultiplyScaleFactorKernel &operator=(CLMultiplyScaleFactorKernel &&) = default;
+ /** Default destructor */
+ ~CLMultiplyScaleFactorKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor.
+ * @param[in] multiplier Additional scale value.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output,
+ float multiplier = 1.f);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLMultiplyScaleFactorKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor.
+ * @note The additional multiplier applied by configure() is not checked by this function.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_scale_factor;
+ ICLTensor *_output;
+ float _multiplier;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ */
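A small configuration sketch for CLMultiplyScaleFactorKernel, assuming the common case of rescaling S32 accumulators back to F32 with a per-batch scale and a constant multiplier; the function name, shapes and the 1/127 constant are hypothetical.

```cpp
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void configure_multiply_scale_factor_example()
{
  // Hypothetical 2D accumulator [8 x 4] with one scale value per row (4 batches).
  CLTensor acc_s32, scale, out_f32;
  acc_s32.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));
  scale.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
  out_f32.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

  CLMultiplyScaleFactorKernel kernel;
  // The multiplier folds an extra constant scale (for example 1/127) into the rescale.
  kernel.configure(&acc_s32, &scale, &out_f32, 1.0f / 127.0f);
}
```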
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..fa383c0d0
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on a tensor */
+class CLNegKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLNegKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel(const CLNegKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel &operator=(const CLNegKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLNegKernel(CLNegKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLNegKernel &operator=(CLNegKernel &&) = default;
+ /** Initialize the kernel's input and output.
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
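A compact end-to-end sketch for CLNegKernel; the shapes and the scheduler setup are assumptions based on the standard CLTensor workflow.

```cpp
#include "arm_compute/core/CL/kernels/CLNegKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void run_neg_example()
{
  CLScheduler::get().default_init();

  // Hypothetical 16-element F32 vectors.
  CLTensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
  out.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

  CLNegKernel kernel;
  kernel.configure(&in, &out); // out[i] = -in[i]

  in.allocator()->allocate();
  out.allocator()->allocate();

  // Fill 'in', then enqueue and wait.
  CLScheduler::get().enqueue(kernel);
  CLScheduler::get().sync();
}
```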
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
new file mode 100644
index 000000000..a512057b9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the kernel to perform one-hot encoding*/
+class CLOneHotKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLOneHotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHotKernel(const CLOneHotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHotKernel &operator=(const CLOneHotKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLOneHotKernel(CLOneHotKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLOneHotKernel &operator=(CLOneHotKernel &&) = default;
+ /** Default destructor */
+ ~CLOneHotKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+ ICLTensor *output, int depth, int axis = -1);
+ /** Initialise the kernel's inputs and output when the output is already initialized to off_value
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth,
+ int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel without off_value
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis = -1);
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Initialise the kernel's inputs and outputs internally
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ int depth, int axis);
+
+private:
+ const ICLTensor *_indices; /**< Indices tensor */
+ const ICLTensor *_on_value; /**< On value tensor */
+ const ICLTensor *_off_value; /**< Off value tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ bool _is_off_value_memset; /**< Whether off_value is zero */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */
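A hypothetical CLOneHotKernel configuration for depth 4 over three S32 indices with scalar on/off values; the output shape shown follows from axis = -1, and all names and shapes are illustrative.

```cpp
#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void configure_one_hot_example()
{
  const int depth = 4; // size of the one-hot dimension

  CLTensor indices, on_value, off_value, output;
  indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));  // e.g. 1.0f
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32)); // e.g. 0.0f
  output.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

  CLOneHotKernel kernel;
  // axis = -1 (default): the one-hot dimension becomes the innermost dimension.
  kernel.configure(&indices, &on_value, &off_value, &output, depth, -1);
}
```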
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
new file mode 100644
index 000000000..4e1b56cba
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
+#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 2D input tensors.
+ */
+class CLQuantizationSymmetricKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLQuantizationSymmetricKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQuantizationSymmetricKernel(const CLQuantizationSymmetricKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQuantizationSymmetricKernel &operator=(const CLQuantizationSymmetricKernel &) = delete;
+ /** Default Move Constructor. */
+ CLQuantizationSymmetricKernel(CLQuantizationSymmetricKernel &&) = default;
+ /** Default move assignment operator */
+ CLQuantizationSymmetricKernel &operator=(CLQuantizationSymmetricKernel &&) = default;
+ /** Default destructor */
+ ~CLQuantizationSymmetricKernel() = default;
+ /** Set the input and output.
+ *
+ * @param[in] input Source tensor. Data types supported: F32/F16.
+ * @param[in] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported:
+ * S8.
+ *
+ * @note Output auto initialization is not supported by this kernel
+ */
+ void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLQuantizationSymmetricKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: F32/F16.
+ * @param[in] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+ * @param[in] output Destination tensor info with the same dimensions as @p input. Data types
+ * supported: S8.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_scale_factor;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ */
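A sketch of per-row symmetric quantization as documented above (2D F32 input, one scale per row, S8 output); the validate-then-configure flow, names and shapes are assumptions.

```cpp
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void configure_symmetric_quantization_example()
{
  // Hypothetical 2D input [64 x 2]: two rows quantized independently, one scale each.
  CLTensor input, scale, output;
  input.allocator()->init(TensorInfo(TensorShape(64U, 2U), 1, DataType::F32));
  scale.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(64U, 2U), 1, DataType::S8)); // no auto-init

  const Status s =
    CLQuantizationSymmetricKernel::validate(input.info(), scale.info(), output.info());
  if (s.error_code() == ErrorCode::OK)
  {
    CLQuantizationSymmetricKernel kernel;
    kernel.configure(&input, &scale, &output);
  }
}
```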
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..4f9042e41
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor
+ */
+ CLReduceOperationKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Default destructor
+ */
+ ~CLReduceOperationKernel() = default;
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ReductionOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperationKernel.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReductionOperation op);
+
+ /**
+ * @brief Run CLReduceOperationKernel op
+ * @param[in] window Window to be used for in_slice
+ * @param[in,out] queue Command queue on which to enqueue the kernel
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
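An illustrative reduce-sum configuration with CLReduceOperationKernel over axis 0; since the documentation above states that the output keeps the input rank, the reduced axis is kept with size 1. Shapes and the helper name are hypothetical.

```cpp
#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void configure_reduce_sum_example()
{
  // Reduce a [8 x 4] tensor along axis 0; the reduced axis keeps size 1.
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

  CLReduceOperationKernel kernel;
  kernel.configure(&input, &output, 0 /* axis */, ReductionOperation::SUM);
}
```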
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
new file mode 100644
index 000000000..4d4478ece
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
+#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to compute per-batch scale factors for symmetric 8-bit quantization
+ * of a 2D input tensor, based on its per-batch min/max values. */
+class CLScaleFactorSymm8Kernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLScaleFactorSymm8Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScaleFactorSymm8Kernel(const CLScaleFactorSymm8Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScaleFactorSymm8Kernel &operator=(const CLScaleFactorSymm8Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLScaleFactorSymm8Kernel(CLScaleFactorSymm8Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLScaleFactorSymm8Kernel &operator=(CLScaleFactorSymm8Kernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor with 2 dimensions. The first dimension will be interpreted as
+ * batches. Data types supported: F32.
+ * @param[out] output Output tensor with shape [batches] which stores the scale values for each 2D
+ * input tensor.
+ * The dimensions over the first must match the batched dimensions of the input
+ * tensor. Data types supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLScaleFactorSymm8Kernel
+ *
+ * @param[in] input Input tensor info. Data types supported: F32.
+ * @param[in] output Output tensor info with shape [batches] which stores the scale values for
+ * each 2D input tensor.
+ * The dimensions over the first must match the batched dimensions of the input
+ * tensor. Data types supported: F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ /** Resets global minimum and maximum
+ *
+ * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
+ */
+ void reset(cl::CommandQueue &queue);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ */
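A minimal sketch of CLScaleFactorSymm8Kernel::configure() producing one scale per row, as would typically feed CLQuantizationSymmetricKernel; the pairing, names and shapes are assumptions.

```cpp
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void configure_scale_factor_example()
{
  // Hypothetical 2D input [64 x 2]: one scale factor is produced per row (batch).
  CLTensor input, scale;
  input.allocator()->init(TensorInfo(TensorShape(64U, 2U), 1, DataType::F32));
  scale.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));

  CLScaleFactorSymm8Kernel kernel;
  kernel.configure(&input, &scale);
  // 'scale' would then typically be consumed by CLQuantizationSymmetricKernel.
}
```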
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..aa4a14812
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLTopKV2Kernel.h
+ * @brief This file defines classes for TopKV2Kernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+// these parameters can be changed
+#define _ITEMS 16 // number of items in a group
+#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
+#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram
+#define PERMUT // store the final permutation
+////////////////////////////////////////////////////////
+
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define CLTopKV2Single
+ */
+class CLTopKV2Single : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Single();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ */
+ CLTopKV2Single(const CLTopKV2Single &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ */
+ CLTopKV2Single(CLTopKV2Single &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] topk_values Values of the top k predictions
+ * @param[in] topk_indices Indices of the top k predictions
+ * @param[in] indices Indices
+ * @param[in] temp_stack Temp stack
+ * @param[in] k K of the top k predictions
+ * @param[in] n Number times to quick-sort
+ * return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
+
+ /*
+ * @brief Run CLTopKV2Single op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_topk_values;
+ ICLTensor *_topk_indices;
+};
+
+/**
+ * @brief Class to define CLTopKV2Init
+ */
+class CLTopKV2Init : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Init();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ */
+ CLTopKV2Init(const CLTopKV2Init &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ */
+ CLTopKV2Init(CLTopKV2Init &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+ * @param[in] in_key_buf Buffer of input key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[in] n Number times to quick-sort
+ * return N/A
+ */
+ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
+
+ /*
+ * @brief Run CLTopKV2Init op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+};
+
+/**
+ * @brief Class to define CLRadixSortHistogram
+ */
+class CLRadixSortHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ */
+ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ */
+ CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @param[in] n Number of integers to sort
+ * return N/A
+ */
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ /**
+ * @brief Set pass
+ * @param[in] pass Current pass of the radix sort algorithm
+ * @param[in] in_key_buf Buffer of input key
+ * return N/A
+ */
+ void setPass(int pass, cl::Buffer *in_key_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ }
+
+ /*
+ * @brief Run CLRadixSortHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+};
+
+/**
+ * @brief Class to define CLRadixSortScanHistogram
+ */
+class CLRadixSortScanHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortScanHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ */
+ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ */
+ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ /*
+ * @brief Run CLRadixSortScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortGlobalScanHistogram
+ */
+class CLRadixSortGlobalScanHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortGlobalScanHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ */
+ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ */
+ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
+ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
+
+ /*
+ * @brief Run CLRadixSortGlobalScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortPasteHistogram
+ */
+class CLRadixSortPasteHistogram : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortPasteHistogram();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ */
+ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ */
+ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * return N/A
+ */
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ /*
+ * @brief Run CLRadixSortPasteHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/**
+ * @brief Class to define CLRadixSortReorder
+ */
+class CLRadixSortReorder : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLRadixSortReorder();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ */
+ CLRadixSortReorder(const CLRadixSortReorder &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ * @return Reference of this instance
+ */
+ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ */
+ CLRadixSortReorder(CLRadixSortReorder &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ * @return Reference of this instance
+ */
+ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @param[in] n Integer number size to sort
+ * return N/A
+ */
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ /**
+ * @brief Set pass
+ * @param[in] pass Current pass of the radix sort algorithm
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+ * return N/A
+ */
+ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+ /*
+ * @brief Run CLRadixSortReorder op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2FindFirstNegative
+ */
+class CLTopKV2FindFirstNegative : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2FindFirstNegative();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ */
+ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ */
+ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number times to find
+ * return N/A
+ */
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ /**
+ * @brief Set output buffer
+ * @param[out] out_key_buf Buffer of output key
+ * return N/A
+ */
+ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
+
+ /*
+ * @brief Run CLTopKV2FindFirstNegative op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_out_key_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2ReorderNegatives
+ */
+class CLTopKV2ReorderNegatives : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2ReorderNegatives();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ */
+ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ */
+ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number times to find
+ * return N/A
+ */
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ /**
+ * @brief Set buffers
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+ * return N/A
+ */
+ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+
+ /*
+ * @brief Run CLTopKV2ReorderNegatives op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+/**
+ * @brief Class to define CLTopKV2Store
+ */
+class CLTopKV2Store : public ICLKernel
+{
+public:
+ /**
+ * @brief Constructor
+ */
+ CLTopKV2Store();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ */
+ CLTopKV2Store(const CLTopKV2Store &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ * @return Reference of this instance
+ */
+ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ */
+ CLTopKV2Store(CLTopKV2Store &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ * @return Reference of this instance
+ */
+ CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] values Values tensor to store
+ * @param[out] indices Indices tensor to be used for store
+ * @param[in] k The k of the top-k predictions
+ * @param[in] n Number of elements to store
+ * @return N/A
+ */
+ void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+
+ /**
+ * @brief Set buffers
+ * @param[out] out_key_buf Buffer of output key
+ * @param[out] out_ind_buf Buffer of output index
+ * @return N/A
+ */
+ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
+
+ /**
+ * @brief Run CLTopKV2Store op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_values;
+ ICLTensor *_indices;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+} // namespace arm_compute
+#endif // Disable GPU implementation
+#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
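A conceptual usage sketch for the three kernels above. This is hedged: the block is guarded out by the preprocessor condition noted above, and the buffer sizes, the meaning of n, and the CLScheduler dispatch below are assumptions for illustration, not part of the patch.

#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

// Reorder negative keys after a sort pass, then store the top-k results (sketch).
void topkv2_tail_sketch(cl::Buffer *in_key, cl::Buffer *out_key, cl::Buffer *in_ind,
                        cl::Buffer *out_ind, ICLTensor *values, ICLTensor *indices, int k, int n)
{
  cl::Buffer first_neg(CLScheduler::get().context(), CL_MEM_READ_WRITE, sizeof(cl_int));

  CLTopKV2FindFirstNegative find_first;
  find_first.configure(&first_neg, n);
  find_first.setOutputBuffer(out_key); // buffer scanned for the first negative key (assumption)
  CLScheduler::get().enqueue(find_first);

  CLTopKV2ReorderNegatives reorder;
  reorder.configure(&first_neg, n);
  reorder.setBuffers(in_key, out_key, in_ind, out_ind);
  CLScheduler::get().enqueue(reorder);

  CLTopKV2Store store;
  store.configure(values, indices, k, n);
  store.setOutputBuffers(out_key, out_ind);
  CLScheduler::get().enqueue(store);
}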
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
new file mode 100644
index 000000000..933d8760d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
+#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+class ITensor;
+class Window;
+class QuantizationInfo;
+} // namespace arm_compute
+
+namespace arm_compute
+{
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ float (*scalar_func)(const float &, const float &),
+ int (*broadcast_func)(int, int, int, const float *, const float &, float *,
+ const bool),
+ int (*neon_func)(int, int, int, const float *, const float *, float *));
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
+ int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
+ uint8_t *, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *));
+} // namespace arm_compute
+#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
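A hedged sketch of callbacks that could be passed to the uint8_t overload above. The meaning of the three int parameters (window start, end and step along x) and the return-value convention (index where vectorized processing stopped) are assumptions based on similar element-wise helpers; the functions below simply process everything scalar-wise.

#include <cstdint>

namespace
{
uint8_t or_scalar(const uint8_t &a, const uint8_t &b) { return static_cast<uint8_t>(a | b); }

int or_broadcast(int start, int end, int step, const uint8_t *in, const uint8_t &broadcast_value,
                 uint8_t *out, const bool /*reorder*/)
{
  (void)step;
  for (int x = start; x < end; ++x)
    out[x] = static_cast<uint8_t>(in[x] | broadcast_value);
  return end; // nothing left for a scalar tail in this sketch
}

int or_loop(int start, int end, int step, const uint8_t *in1, const uint8_t *in2, uint8_t *out)
{
  (void)step;
  for (int x = start; x < end; ++x)
    out[x] = static_cast<uint8_t>(in1[x] | in2[x]);
  return end;
}
} // namespace

// Inside a kernel's run(window, info) one would then call (sketch):
//   arm_compute::elementwise_op(_input1, _input2, _output, window,
//                               &or_scalar, &or_broadcast, &or_loop);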
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
new file mode 100644
index 000000000..8c544cda8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+
+class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+ /** Default destructor */
+ ~NEBinaryLogicalOperationKernel() = default;
+
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] op Binary logical operation to be executed.
+ * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2,
+ ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] op Binary logical operation to be executed.
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a Status
+ */
+ static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1,
+ const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+ // Inherited methods overridden:
+ static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
+ const ITensorInfo &output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
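A minimal usage sketch for a logical AND on two U8 tensors; the shapes and the scheduler split dimension are illustrative assumptions, not requirements of the kernel.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void binary_logical_and_sketch()
{
  Tensor a, b, out;
  const TensorInfo info(TensorShape(32U, 8U), 1, DataType::U8);
  a.allocator()->init(info);
  b.allocator()->init(info);
  out.allocator()->init(info);

  ARM_COMPUTE_ERROR_THROW_ON(NEBinaryLogicalOperationKernel::validate(
      BinaryLogicalOperation::AND, a.info(), b.info(), out.info()));

  NEBinaryLogicalOperationKernel kernel;
  kernel.configure(BinaryLogicalOperation::AND, &a, &b, &out);

  a.allocator()->allocate();
  b.allocator()->allocate();
  out.allocator()->allocate();
  // ... fill `a` and `b` with 0/1 bytes ...

  NEScheduler::get().schedule(&kernel, Window::DimY);
}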
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
new file mode 100644
index 000000000..101f6ac8e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class for the kernel converting boolean type
+ */
+class NECastBoolKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NECastBoolKernel"; }
+ /** Default constructor*/
+ NECastBoolKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastBoolKernel(const NECastBoolKernel &) = delete;
+ /** Default move constructor */
+ NECastBoolKernel(NECastBoolKernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastBoolKernel &operator=(const NECastBoolKernel &) = delete;
+ /** Default move assignment operator */
+ NECastBoolKernel &operator=(NECastBoolKernel &&) = default;
+ /** Set the input and output of the kernel
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NECastBoolKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: U8
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */
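A minimal usage sketch casting a U8 boolean tensor to F32; the shapes and the scheduler split dimension below are illustrative assumptions.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void cast_bool_sketch()
{
  Tensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));   // 0/1 values
  out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // 0.0f/1.0f values

  ARM_COMPUTE_ERROR_THROW_ON(NECastBoolKernel::validate(in.info(), out.info()));

  NECastBoolKernel kernel;
  kernel.configure(&in, &out);

  in.allocator()->allocate();
  out.allocator()->allocate();
  // ... fill `in` with 0/1 bytes ...

  NEScheduler::get().schedule(&kernel, Window::DimY);
}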
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
new file mode 100644
index 000000000..88f21c96e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform EmbeddingLookup operation */
+class NEEmbeddingLookupKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEEmbeddingLookupKernel"; }
+ /** Default constructor */
+ NEEmbeddingLookupKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] lookups Lookups is a 1D tensor whose values are indices into the first dimension
+ * of @p input. Data types supported: S32.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEEmbeddingLookupKernel
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ const ITensor *_lookups;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */
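A minimal usage sketch; the shapes (1000 embeddings of width 64, 8 lookups) and the row-major reading of "first dimension" are assumptions for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void embedding_lookup_sketch()
{
  Tensor input, output, lookups;
  input.allocator()->init(TensorInfo(TensorShape(64U, 1000U), 1, DataType::F32)); // embedding table
  lookups.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));       // 8 row indices
  output.allocator()->init(TensorInfo(TensorShape(64U, 8U), 1, DataType::F32));   // gathered rows

  ARM_COMPUTE_ERROR_THROW_ON(
      NEEmbeddingLookupKernel::validate(input.info(), output.info(), lookups.info()));

  NEEmbeddingLookupKernel kernel;
  kernel.configure(&input, &output, &lookups);
  // allocate the tensors, fill `input` and `lookups`, then:
  NEScheduler::get().schedule(&kernel, Window::DimY);
}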
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
new file mode 100644
index 000000000..5acfde5a8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
+#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform Gather operation on NEON */
+class NEGatherKernelEx : public INEKernel
+{
+public:
+ /** Default constructor. */
+ NEGatherKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEGatherKernelEx(const NEGatherKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete;
+ /** Allow instances of this class to be moved. */
+ NEGatherKernelEx(NEGatherKernelEx &&) = default;
+ /** Allow instances of this class to be moved. */
+ NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default;
+ /** Default destructor */
+ ~NEGatherKernelEx() = default;
+
+ /** Name of the kernel
+ *
+ * @return Kernel name
+ */
+ const char *name() const override { return "NEGatherKernelEx"; }
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values
+ * wrap around. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values
+ * wrap around. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Implementation of the gather operation for 0 axis.
+ *
+ * For gather on the 0 axis an element by element copy is performed.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info);
+
+ /** Implementation of the gather operation.
+ *
+ * For axis >= 1, a row-wise copy takes place.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info);
+
+ using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info);
+
+ const ITensor *_input;
+ const ITensor *_indices;
+ int _axis;
+ size_t _indices_rank;
+ ITensor *_output;
+ kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */
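A minimal gather sketch along axis 1; the output-shape rule used here (the indexed dimension is replaced by the indices length) and the scheduler call are assumptions for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gather_ex_sketch()
{
  Tensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(10U, 5U), 1, DataType::F32));  // 5 rows of 10
  indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));     // pick 3 rows
  output.allocator()->init(TensorInfo(TensorShape(10U, 3U), 1, DataType::F32)); // gathered rows

  ARM_COMPUTE_ERROR_THROW_ON(
      NEGatherKernelEx::validate(input.info(), indices.info(), output.info(), 1));

  NEGatherKernelEx kernel;
  kernel.configure(&input, &indices, &output, 1);

  input.allocator()->allocate();
  indices.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill `input` and `indices` ...

  NEScheduler::get().schedule(&kernel, Window::DimY);
}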
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
new file mode 100644
index 000000000..cb2a485d5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HashtableLookup operation */
+class NEHashtableLookupKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEHashtableLookupKernel"; }
+ /** Default constructor */
+ NEHashtableLookupKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default;
+ /** Initialize the kernel's inputs, outputs.
+ *
+ * @param[in] lookups Lookups is a 1D tensor whose values are indices into the first dimension
+ * of @p input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ */
+ void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+ ITensor *hits);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEHashtableLookupKernel
+ *
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output The output tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+ * hits (True) or not (False). Data types supported: U8/QASYMM8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_lookups; /** Lookups tensor */
+ const ITensor *_keys; /** Keys tensor */
+ const ITensor *_input; /** Source tensor */
+ ITensor *_output; /** Destination tensor */
+ ITensor *_hits; /** Hits tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */
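A minimal usage sketch; the shapes, in particular one 8-float row of @p input per key, are assumptions for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void hashtable_lookup_sketch()
{
  Tensor lookups, keys, input, output, hits;
  lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));    // 4 queried keys
  keys.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32));      // 16 stored keys
  input.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::F32)); // one row per key
  output.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32)); // one row per query
  hits.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U8));        // hit flags

  ARM_COMPUTE_ERROR_THROW_ON(NEHashtableLookupKernel::validate(
      lookups.info(), keys.info(), input.info(), output.info(), hits.info()));

  NEHashtableLookupKernel kernel;
  kernel.configure(&lookups, &keys, &input, &output, &hits);
  // allocate the tensors, fill `lookups`, `keys` and `input`, then:
  NEScheduler::get().schedule(&kernel, Window::DimY);
}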
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..8724cc69b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for performing an instance normalization */
+class NEInstanceNormalizationLayerKernelEx : public INEKernel
+{
+public:
+ const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; }
+ /** Default constructor */
+ NEInstanceNormalizationLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayerKernelEx &
+ operator=(const NEInstanceNormalizationLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ NEInstanceNormalizationLayerKernelEx &
+ operator=(NEInstanceNormalizationLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~NEInstanceNormalizationLayerKernelEx() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * In case of @p output tensor = nullptr this tensor will store the result
+ * of the normalization.
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor.
+ * Defaults to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEInstanceNormalizationLayer.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults
+ * to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Common signature for all the specialized instance normalization functions
+ *
+ * @param[in, out] input An input tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * @param[out] output The output tensor.
+ * @param[in] gamma The scale scalar value applied to the normalized tensor. Defaults to
+ * 1.0
+ * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to
+ * 0.0
+ * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
+ */
+ using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon, const Window &window);
+
+ NormalizationFunction *_func;
+ ITensor *_input;
+ ITensor *_output;
+ ITensor *_gamma;
+ ITensor *_beta;
+ float _epsilon;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
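A minimal NCHW sketch; the per-channel gamma/beta shapes and the scheduler split dimension are assumptions, and per the defaults above gamma/beta may also be left as nullptr.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void instance_norm_ex_sketch()
{
  Tensor input, output, gamma, beta;
  const TensorShape nchw(7U, 7U, 16U, 1U); // W, H, C, N
  input.allocator()->init(TensorInfo(nchw, 1, DataType::F32));
  output.allocator()->init(TensorInfo(nchw, 1, DataType::F32));
  gamma.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // per-channel scale (assumption)
  beta.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));  // per-channel offset (assumption)

  ARM_COMPUTE_ERROR_THROW_ON(NEInstanceNormalizationLayerKernelEx::validate(
      input.info(), output.info(), gamma.info(), beta.info(), 1e-12f));

  NEInstanceNormalizationLayerKernelEx kernel;
  kernel.configure(&input, &output, &gamma, &beta, 1e-12f);
  // allocate and fill the tensors, then:
  NEScheduler::get().schedule(&kernel, Window::DimZ); // split dimension is an assumption
}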
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
new file mode 100644
index 000000000..198b0be9d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to multiply scale factor kernel. */
+class NEMultiplyScaleFactorKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEMultiplyScaleFactorKernel"; }
+ /** Default constructor */
+ NEMultiplyScaleFactorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete;
+ /** Default Move Constructor. */
+ NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default;
+ /** Default move assignment operator */
+ NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default;
+ /** Default destructor */
+ ~NEMultiplyScaleFactorKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor.
+ * @param[in] multiplier (Optional) Extra multiplier applied to the scale factor. Defaults to 1.f.
+ */
+ void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output,
+ float multiplier = 1.f);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEMultiplyScaleFactorKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: S32.
+ * @param[in] scale_factor Scale tensor. Data type supported: F16/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output, float multiplier = 1.f);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ template <typename T> void multiply(const Window &window);
+
+private:
+ const ITensor *_input;
+ const ITensor *_scale_factor;
+ ITensor *_output;
+ float _multiplier;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */
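A minimal sketch multiplying S32 accumulators by per-row scale factors; the per-row interpretation of @p scale_factor and the shapes are assumptions for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void multiply_scale_factor_sketch()
{
  Tensor input, scale_factor, output;
  input.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::S32));
  scale_factor.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32)); // one per row (assumption)
  output.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::F32));

  ARM_COMPUTE_ERROR_THROW_ON(NEMultiplyScaleFactorKernel::validate(
      input.info(), scale_factor.info(), output.info(), 1.f));

  NEMultiplyScaleFactorKernel kernel;
  kernel.configure(&input, &scale_factor, &output, 1.f);
  // allocate the tensors, fill `input` and `scale_factor`, then:
  NEScheduler::get().schedule(&kernel, Window::DimY);
}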
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
new file mode 100644
index 000000000..99bb351bc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Kernel to perform OneHot operation on NEON */
+class NEOneHotKernel : public INEKernel
+{
+public:
+ /** Default constructor. */
+ NEOneHotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel(const NEOneHotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel &operator=(const NEOneHotKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel(NEOneHotKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel &operator=(NEOneHotKernel &&) = default;
+ /** Default destructor */
+ ~NEOneHotKernel() = default;
+ /** Name of the kernel
+ *
+ * @return Kernel name
+ */
+ const char *name() const override { return "NEOneHotKernel"; }
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to
+ * 3. Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same
+ * as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+ *
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank:
+ * up to 3. Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis = -1);
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Implementation of the onehot operation for 0 axis.
+ *
+ * For onehot on the 0 axis an element by element copy is performed.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info);
+ /** Implementation of the onehot operation.
+ *
+ * For axis >= 1, a row-wise copy takes place.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info);
+ using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info);
+ const ITensor *_indices;
+ const ITensor *_depth;
+ const ITensor *_on_value;
+ const ITensor *_off_value;
+ int _axis;
+ ITensor *_output;
+ kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */
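A minimal one-hot sketch; the scalar shapes for depth/on/off and the depth-major output layout are assumptions for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void one_hot_sketch()
{
  Tensor indices, depth, on_value, off_value, output;
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));   // 4 class ids
  depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));     // e.g. 10 classes
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));  // e.g. 1.0f
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32)); // e.g. 0.0f
  output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

  ARM_COMPUTE_ERROR_THROW_ON(NEOneHotKernel::validate(indices.info(), depth.info(),
                                                      on_value.info(), off_value.info(),
                                                      output.info(), -1));

  NEOneHotKernel kernel;
  kernel.configure(&indices, &depth, &on_value, &off_value, &output, -1);
  // allocate the tensors, write depth/on/off values and the indices, then:
  NEScheduler::get().schedule(&kernel, Window::DimY);
}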
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
new file mode 100644
index 000000000..0b080cf73
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the symmetric quantization kernel. */
+class NEQuantizationSymmetricKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEQuantizationSymmetricKernel"; }
+ /** Default constructor */
+ NEQuantizationSymmetricKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete;
+ /** Default Move Constructor. */
+ NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default;
+ /** Default move assignment operator */
+ NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default;
+ /** Default destructor */
+ ~NEQuantizationSymmetricKernel() = default;
+ /** Set input, output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[out] output Destination tensor with the same dimensions as @p input. Data type
+ * supported: S8.
+ * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output, ITensor *scale_factor);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEQuantizationSymmetricKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: F16/F32.
+ * @param[in] output Output tensor info. Data types supported: S8.
+ * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ template <typename T> void quantize(const Window &window);
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ ITensor *_scale_factor;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */
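A minimal sketch quantizing an F32 tensor to S8 with a per-row scale factor; the per-row scale shape is an assumption for illustration.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void quantize_symmetric_sketch()
{
  Tensor input, output, scale_factor;
  input.allocator()->init(TensorInfo(TensorShape(64U, 8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(64U, 8U), 1, DataType::S8));
  scale_factor.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32)); // one per row (assumption)

  ARM_COMPUTE_ERROR_THROW_ON(NEQuantizationSymmetricKernel::validate(
      input.info(), output.info(), scale_factor.info()));

  NEQuantizationSymmetricKernel kernel;
  kernel.configure(&input, &output, &scale_factor);
  // allocate the tensors and fill `input`, then:
  NEScheduler::get().schedule(&kernel, Window::DimY);
}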
diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
new file mode 100644
index 000000000..cda8a30b1
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_TYPESEX_H__
+#define __ARM_COMPUTE_TYPESEX_H__
+
+namespace arm_compute
+{
+
+/** Available ArgIndex operations */
+enum class ArgOperation
+{
+ MAX,
+ MIN,
+};
+
+/** Available binary logical operations */
+enum class BinaryLogicalOperation
+{
+ AND, /**< AND */
+ OR, /**< OR */
+};
+
+enum class ComparisonOperationEx
+{
+ EQUAL, /**< EQUAL */
+ NOT_EQUAL, /**< NOT_EQUAL */
+};
+
+enum class ElementWiseUnaryEx
+{
+ NEG, /**< NEG */
+};
+
+enum class SubDataType
+{
+ NONE,
+ BOOL,
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TYPESEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
new file mode 100644
index 000000000..d57e8fcf5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_UTILSEX_H__
+#define __ARM_COMPUTE_UTILSEX_H__
+
+#include <utility>
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+
+/** Returns expected width and height of the transpose convolution's output tensor.
+ *
+ * @note This function was copied in order to fix a bug that computed wrong output dimensions.
+ *
+ * @param[in] in_width Width of input tensor (Number of columns)
+ * @param[in] in_height Height of input tensor (Number of rows)
+ * @param[in] kernel_width Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] info padding and stride info.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_top The number of zeros added to the top edge of the output.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int>
+transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_top);
+}
+#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
new file mode 100644
index 000000000..1e69f0912
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+
+/** Calculate the upsampled output shape used for transpose convolution
+ *
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor shape
+ * @param[in] info Padding and stride info
+ * @param[in] out_dims Output shape dimensions
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[out] pad_left Padding on left
+ * @param[out] pad_right Padding on right
+ * @param[out] pad_top Padding on top
+ * @param[out] pad_bottom Padding on bottom
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_transposeconv_upsampled_shape(
+ const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+ std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
+ unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
+ unsigned int &pad_top, unsigned int &pad_bottom)
+{
+ unsigned int sx = info.stride().first;
+ unsigned int sy = info.stride().second;
+ const DataLayout data_layout = input.data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Find the upsampled dimensions
+ // transpose conv out:
+ // tconv_out + pad = 1 + (in - 1) * stride + invalid
+ // tconv_out = 1 + (in - 1) * stride + invalid - pad
+ // upsample out:
+ // upsample_out = 1 + (in - 1) * stride
+ unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
+ unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+
+ // Find the padding needed for the convolution with stride 1 in order to match output shape
+ // upsample+pad out:
+ // upsample_out + pad = tconv_out + kernel - 1
+ // pad = tconv_out + kernel - 1 - upsample_out
+ unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
+ unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
+ out_x += padx;
+ out_y += pady;
+
+ unsigned int padx_all_except_invalid = padx + info.pad_left() + info.pad_right() - invalid_right;
+ unsigned int pady_all_except_invalid =
+ pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
+ pad_left = (padx_all_except_invalid + 1) / 2 - info.pad_left();
+ pad_right = padx_all_except_invalid / 2 - info.pad_right() + invalid_right;
+ pad_top = (pady_all_except_invalid + 1) / 2 - info.pad_top();
+ pad_bottom = pady_all_except_invalid / 2 - info.pad_bottom() + invalid_bottom;
+
+ TensorShape scale_out_shape(input.tensor_shape());
+ scale_out_shape.set(idx_w, out_x);
+ scale_out_shape.set(idx_h, out_y);
+
+ return scale_out_shape;
+}
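+
+// Worked example (illustrative numbers only), following the comments inside
+// compute_transposeconv_upsampled_shape() above: for an input width of 4, stride_x = 2,
+// a 3x3 kernel and a desired output width of 8 (no extra padding or invalid columns):
+//   upsample_out = 1 + (4 - 1) * 2 = 7
+//   padx         = 8 - (7 - 3 + 1) = 3
+//   upsampled w  = 7 + 3           = 10
+// A stride-1 convolution with the 3x3 kernel over the width-10 upsampled tensor then yields
+// the desired output width of 10 - 3 + 1 = 8.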
+
+/** Calculate the output shape of the transpose convolution layer
+ *
+ * @param[in] out_dims Output x and y shape dimensions
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor shape
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+ const ITensorInfo &input, const ITensorInfo &weights)
+{
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
+
+ const DataLayout data_layout = input.data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ TensorShape out_shape{input_shape};
+ out_shape.set(width_idx, out_dims.first);
+ out_shape.set(height_idx, out_dims.second);
+ out_shape.set(channel_idx, weights_shape[batch_idx]);
+ return out_shape;
+}
+
+/** Calculate the depth to space output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] block Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block)
+{
+ ARM_COMPUTE_ERROR_ON(block < 2);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(idx_width, input->dimension(idx_width) * block);
+ output_shape.set(idx_height, input->dimension(idx_height) * block);
+ output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block));
+
+ return output_shape;
+}
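+
+// Illustrative example (not part of the original header): for compute_depth_to_space_shape_ex()
+// above, an input of width 4, height 4 and 16 channels with block = 2 produces an output of
+// width 8, height 8 and 16 / (2 * 2) = 4 channels.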
+
+/** Calculate the space to depth output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] block_shape Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape)
+{
+ ARM_COMPUTE_ERROR_ON(block_shape < 2);
+ TensorShape output_shape{input->tensor_shape()};
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ output_shape.set(idx_width, input->tensor_shape()[idx_width] / block_shape);
+ output_shape.set(idx_height, input->tensor_shape()[idx_height] / block_shape);
+ output_shape.set(idx_depth, input->tensor_shape()[idx_depth] * (block_shape * block_shape));
+
+ return output_shape;
+}
+
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape Input tensor shape
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] actual_axis The axis to be gathered
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
+ const TensorShape &indices_shape, uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
+ ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+
+ TensorShape output_shape = input_shape;
+ if (indices_shape.num_dimensions() == 1)
+ {
+ output_shape[actual_axis] = indices_shape[0];
+ }
+ else if (indices_shape.num_dimensions() > 1)
+ {
+ output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+ for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+ {
+ if (o == actual_axis)
+ {
+ ++i;
+ for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+ {
+ output_shape[o] = indices_shape[in];
+ }
+ }
+ else
+ {
+ output_shape[o] = input_shape[i];
+ }
+ }
+ }
+ return output_shape;
+}
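+
+// Illustrative example (not part of the original header): with a 1-D indices tensor,
+// compute_gather_shape_ex() replaces the dimension at actual_axis with the number of indices,
+// e.g. input_shape = [5, 3], indices_shape = [7], actual_axis = 0  ->  output = [7, 3].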
+
+/** Calculate the one-hot output shape of a tensor
+ *
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] depth Depth of the one-hot dimension
+ * @param[in] actual_axis The axis at which the one-hot dimension is inserted
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth,
+ uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions());
+
+ TensorShape output_shape;
+ output_shape.set(actual_axis, depth);
+
+ unsigned int i_shift = 0;
+ for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i)
+ {
+ if (i == actual_axis)
+ {
+ i_shift++;
+ }
+ output_shape.set(i + i_shift, indices_shape[i]);
+ }
+
+ return output_shape;
+}
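+
+// Illustrative example (not part of the original header): for compute_onehot_shape_ex(),
+// indices_shape = [4] with depth = 10 gives [10, 4] for actual_axis = 0 and [4, 10] for
+// actual_axis = 1.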
+
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
new file mode 100644
index 000000000..484ebfd0b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
+#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
+
+#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
+#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+#include <arm_compute/runtime/CL/functions/CLCastBool.h>
+#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
+#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
+#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
+#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/CL/functions/CLNeg.h>
+#include <arm_compute/runtime/CL/functions/CLOneHot.h>
+#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
+#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
+#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
+
+#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
new file mode 100644
index 000000000..b1ee52bf9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+class ICLTensor;
+
+/** Function to calculate the index of the minimum or maximum values in a
+ * tensor based on an axis.
+ *
+ * @note The default data type for an uninitialized output tensor is
+ * signed 32-bit integer (S32). It is the user's responsibility to check
+ * that the results do not overflow because the indices are computed
+ * in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerEx : public IFunction
+{
+public:
+ /** Default Constructor.
+ *
+ * @param[in] memory_manager (Optional) Memory manager.
+ */
+ CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[out] output Output source tensor. Data types supported: U32/S32.
+ * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+ * ARG_IDX_MIN
+ */
+ void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxLayerEx
+ *
+ * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[in] output Output source tensor info. Data types supported: U32/S32.
+ * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+ * ARG_IDX_MIN
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+ const ReductionOperation &op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<CLTensor> _results_vector;
+ CLTensor _not_reshaped_output;
+ std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
+ CLReshapeLayerKernel _reshape_kernel;
+ unsigned int _num_of_stages;
+ unsigned int _reduction_axis;
+};
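+
+// A minimal usage sketch (illustrative only; the input and output tensors are assumed to be
+// configured and allocated elsewhere):
+//
+//   CLArgMinMaxLayerEx argminmax;
+//   argminmax.configure(&input, 0 /* axis */, &output, ReductionOperation::ARG_IDX_MAX);
+//   argminmax.run();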
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..88a9b00ec
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8.
+ * @param[in] input2 Source tensor2. Data types supported: U8, QASYMM8.
+ * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+};
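+
+// A minimal usage sketch (illustrative only; input1, input2 and output are assumed to be
+// allocated U8/QASYMM8 tensors):
+//
+//   CLBinaryLogicalOp logical_and;
+//   logical_and.configure(&input1, &input2, &output, BinaryLogicalOperation::AND);
+//   logical_and.run();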
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
new file mode 100644
index 000000000..d6150684a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLCastBool.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCastBool class
+ */
+
+#ifndef ARM_COMPUTE_CLCASTBOOL_H
+#define ARM_COMPUTE_CLCASTBOOL_H
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLCastBoolKernel.
+ * This converts the boolean input tensor to the output tensor's type.
+ */
+class CLCastBool : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32.
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
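+
+// A minimal usage sketch (illustrative only; input is an allocated boolean (U8) tensor and
+// output an allocated tensor of the desired type, e.g. F32):
+//
+//   CLCastBool cast;
+//   cast.configure(&input, &output);
+//   cast.run();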
+}
+#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
new file mode 100644
index 000000000..409eaf593
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the deconvolution layer.
+ *
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input and pad is the amount of padding.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ * width_input is the size of the first input dimension.
+ * height_input is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimensions.
+ *
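+ * For example (illustrative numbers only): with width_input = 8, stride_x = 2, padding_x = 0
+ * and kernel_x = 3, the relation above gives width_output = (8 - 1) * 2 - 2 * 0 + 3 = 17.
+ *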
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
+ * And the following OpenCL function:
+ * -# @ref CLReverse
+ *
+ */
+class CLDirectTransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move constructor */
+ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move assignment operator */
+ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLDirectTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for input
+ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ CLDeconvolutionLayerUpsample _scale_f;
+ CLConvolutionLayer _conv_f;
+ CLReverse _flip_weights;
+
+ CLTensor _scaled_output;
+ ICLTensor *_original_weights;
+ CLTensor _weights_flipped;
+ CLTensor _flip_axis;
+
+ bool _is_prepared;
+};
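+
+// A minimal usage sketch (illustrative only; the tensors, strides and invalid-border values
+// are hypothetical and assumed to be set up elsewhere):
+//
+//   CLDirectTransposeConvLayer tconv;
+//   tconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 0, 0),
+//                   0 /* invalid_right */, 0 /* invalid_bottom */);
+//   tconv.run();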
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..fbee7e40e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
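+
+// A minimal usage sketch (illustrative only; lookups is an allocated 1D index tensor and
+// input/output are allocated tensors of matching type):
+//
+//   CLEmbeddingLookup embedding_lookup;
+//   embedding_lookup.configure(&input, &output, &lookups);
+//   embedding_lookup.run();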
+}
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..f3266f688
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
+#define __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
+ * the following kernels:
+ *
+ * -# @ref CLTransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * S8.
+ * @param[out] output Destination tensor which stores the transposed input tensor. Data type
+ * supported: Same as @p input.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLFullyConnectedHybridLayerReshapeWeights
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * S8.
+ * @param[in] output Destination tensor which stores the transposed input tensor. Data type
+ * supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
+ * OpenCL kernels:
+ *
+ * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
+ * and transpose_weights is set to true) (called once)
+ * -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric)
+ * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedHybridLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFullyConnectedHybridLayer(const CLFullyConnectedHybridLayer &) = delete;
+ /** Default move constructor */
+ CLFullyConnectedHybridLayer(CLFullyConnectedHybridLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFullyConnectedHybridLayer &operator=(const CLFullyConnectedHybridLayer &) = delete;
+ /** Default move assignment operator */
+ CLFullyConnectedHybridLayer &operator=(CLFullyConnectedHybridLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
+ ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLFullyConnectedHybridLayer
+ *
+ * @param[in] input Source tensor info. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output,
+ bool retain_internal_weights);
+
+ MemoryGroup _memory_group;
+ CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
+ CLScaleFactorSymm8Kernel _scale_factor_kernel;
+ CLQuantizationSymmetricKernel _quant_input_kernel;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLMultiplyScaleFactorKernel _multiply_scale_kernel;
+ CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to
+ // add bias in
+ // CLFullyConnectedHybridLayer
+ CLTensor _reshape_weights_output;
+ CLTensor _quantized_input;
+ CLTensor _scale_factor;
+ CLTensor _gemmlowp_output;
+ bool _are_weights_reshaped;
+ bool _accumulate_biases;
+ bool _is_prepared;
+ const ICLTensor *_original_weights;
+};
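+
+// A minimal usage sketch (illustrative only; input/output are F32 tensors and weights is a
+// 2D S8 tensor, all assumed to be allocated elsewhere):
+//
+//   CLFullyConnectedHybridLayer fc;
+//   fc.configure(&input, &weights, &biases, &output);
+//   fc.run();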
+}
+#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
new file mode 100644
index 000000000..e65a646dc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__
+#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
+ * the following kernels:
+ *
+ * -# @ref CLTransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor which stores the transposed input tensor. Data type
+ * supported: Same as @p input.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLFullyConnectedLayerReshapeWeightsEx
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[in] output Destination tensor which stores the transposed input tensor. Data type
+ * supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+namespace weights_transformations
+{
+/** Basic function to manage the reshape weights generated from @ref
+ * CLFullyConnectedLayerReshapeWeightsEx */
+class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights
+{
+public:
+ // Inherited method override
+ void run() override
+ {
+ _output.allocator()->allocate();
+ _func.run();
+ _reshape_run = true;
+ }
+
+ // Inherited method override
+ void release() override { _output.allocator()->free(); }
+
+ // Inherited method override
+ ICLTensor *get_weights() override { return &_output; }
+
+ // Inherited method override
+ uint32_t uid() override { return _uid; }
+
+ /** Configures the @ref CLFullyConnectedLayerReshapeWeightsEx function
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ */
+ void configure(const ICLTensor *input) { _func.configure(input, &_output); }
+
+private:
+ static constexpr uint32_t _uid = 0x0;
+ CLTensor _output{};
+ CLFullyConnectedLayerReshapeWeightsEx _func{};
+};
+} // namespace weights_transformations
+
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
+ * OpenCL kernels:
+ *
+ * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref CLFullyConnectedLayerReshapeWeightsEx (if @p are_weights_reshaped is set to false and
+ * transpose_weights is set to true) (called once)
+ * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref
+ * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFullyConnectedLayerEx(const CLFullyConnectedLayerEx &) = delete;
+ /** Default move constructor */
+ CLFullyConnectedLayerEx(CLFullyConnectedLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFullyConnectedLayerEx &operator=(const CLFullyConnectedLayerEx &) = delete;
+ /** Default move assignment operator */
+ CLFullyConnectedLayerEx &operator=(CLFullyConnectedLayerEx &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
+ ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLFullyConnectedLayerEx
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
+ void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
+ void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
+
+ MemoryGroup _memory_group;
+ IWeightsManager *_weights_manager;
+ CLConvertFullyConnectedWeights _convert_weights;
+ weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
+ weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged
+ _reshape_weights_managed_function;
+ CLFlattenLayer _flatten_layer;
+ CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function;
+ CLGEMM _mm_gemm;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLTensor _flatten_output;
+ CLTensor _converted_weights_output;
+ CLTensor _reshape_weights_output;
+ bool _are_weights_converted;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _is_quantized;
+ bool _is_prepared;
+ const ICLTensor *_original_weights;
+};
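+
+// A minimal validate-then-configure sketch (illustrative only; the tensors are hypothetical
+// and assumed to be allocated elsewhere):
+//
+//   const Status s = CLFullyConnectedLayerEx::validate(input.info(), weights.info(),
+//                                                      biases.info(), output.info());
+//   // ... check s, then:
+//   CLFullyConnectedLayerEx fc;
+//   fc.configure(&input, &weights, &biases, &output);
+//   fc.run();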
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..289ab167f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        CLFullyConnectedReshapingLayer.h
+ * @brief       This file contains CLFullyConnectedReshapingLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class CLFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ enum class KernelType
+ {
+ GENERAL, /**< General FC */
+ PREPROCESSED_WEIGHTS /**< Weights are constant, so they can be preprocessed */
+ };
+
+public:
+ CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
+ _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+ * @param[in] biases The tensor that is filled with bias values
+ * @param[out] output The destination tensor
+ * @param[in] needs_reshape Whether the input needs to be reshaped before the fully connected layer
+ * @param[in] reshape The target shape of the reshaped input. Only valid when needs_reshape is true.
+ * @param[in] kernel_type Kernel to use: GENERAL or PREPROCESSED_WEIGHTS (see @ref KernelType)
+ * @return N/A
+ */
+ void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights,
+ const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape,
+ KernelType kernel_type);
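+
+ /* Usage sketch (illustrative only). "src", "weights", "bias" and "dst" are assumed
+  * pre-initialised CLTensor objects; the reshape target shape is an example value:
+  *
+  *   CLFullyConnectedReshapingLayer fc;
+  *   fc.configure(&src, &weights, &bias, &dst,
+  *                true, TensorShape(64U, 16U), // reshape the input to 64x16 first
+  *                CLFullyConnectedReshapingLayer::KernelType::GENERAL);
+  *   fc.prepare();
+  *   fc.run();
+  */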
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ const arm_compute::ICLTensor *_input;
+ const arm_compute::ICLTensor *_weights;
+ const arm_compute::ICLTensor *_biases;
+ arm_compute::ICLTensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::CLTensor _cl_buffer;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<arm_compute::IFunction> _cl_fc;
+ CLReshapeLayer _cl_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
new file mode 100644
index 000000000..b01ec4255
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLGatherEx.h
+ * @brief This file contains CLGatherEx class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLGATHEREX_H__
+#define __ARM_COMPUTE_CLGATHEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
+class CLGatherEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and output.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGatherEx
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis = 0);
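+
+ /* Usage sketch (illustrative only). "params", "indices" and "dst" are assumed
+  * pre-initialised CLTensor objects:
+  *
+  *   CLGatherEx gather;
+  *   Status s = CLGatherEx::validate(params.info(), indices.info(), dst.info(), 1);
+  *   gather.configure(&params, &indices, &dst, 1); // gather along axis 1
+  *   gather.run();
+  */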
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..6618f5aa4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * @p input.
+ * @param[in] keys Keys 1D tensor. The keys and input tensors together represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
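+
+ /* Usage sketch (illustrative only). All tensors are assumed pre-initialised
+  * CLTensor objects with the shapes and data types documented above:
+  *
+  *   CLHashtableLookup lookup;
+  *   lookup.configure(&lookups, &keys, &values, &dst, &hits);
+  *   lookup.run(); // lookups that miss are reported through "hits"
+  */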
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..887e7aaa5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform an Instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref CLInstanceNormalizationLayerKernelEx
+ */
+class CLInstanceNormalizationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Default constructor */
+ CLInstanceNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults
+ * to nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+ ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLInstanceNormalizationLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to
+ * nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
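+
+ /* Usage sketch (illustrative only). "src", "dst", "gamma" and "beta" are assumed
+  * pre-initialised CLTensor objects (gamma and beta may also be nullptr):
+  *
+  *   CLInstanceNormalizationLayerEx norm;
+  *   norm.configure(&src, &dst, &gamma, &beta, 1e-5f);
+  *   norm.run();
+  */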
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..8ec9aa307
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
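+
+ /* Usage sketch (illustrative only). "src" and "dst" are assumed pre-initialised
+  * CLTensor objects of the same shape:
+  *
+  *   CLNeg neg;
+  *   neg.configure(&src, &dst); // dst = -src, element-wise
+  *   neg.run();
+  */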
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
new file mode 100644
index 000000000..2bbfca821
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOT_H__
+#define __ARM_COMPUTE_CLONEHOT_H__
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+namespace arm_compute
+{
+class ICLTensor;
+/** Basic function to run @ref CLOneHotKernel */
+class CLOneHot : public IFunction
+{
+public:
+ /** Constructor */
+ CLOneHot();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHot(const CLOneHot &) = delete;
+ /** Default move constructor */
+ CLOneHot(CLOneHot &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHot &operator=(const CLOneHot &) = delete;
+ /** Default move assignment operator */
+ CLOneHot &operator=(CLOneHot &&) = default;
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+ ICLTensor *output, int depth, int axis = -1);
+ /** Initialise the kernel's inputs and outputs with off_value being constant
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] off_value The PixelValue for off value. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ PixelValue off_value, int depth, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis = -1);
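+
+ /* Usage sketch (illustrative only), using the constant off_value overload. "indices",
+  * "on_value" and "dst" are assumed pre-initialised CLTensor objects; a depth of 10 is
+  * an example value:
+  *
+  *   CLOneHot onehot;
+  *   onehot.configure(&indices, &on_value, &dst, PixelValue(0.f), 10, -1);
+  *   onehot.run();
+  */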
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMemsetKernel _memset_kernel; /**< Memset kernel */
+ CLOneHotKernel _onehot_kernel; /**< OneHot kernel */
+ bool _has_to_memset;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLONEHOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..bb852e404
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager);
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axes along which to reduce. They must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ bool keep_dims, ReductionOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axes along which to reduce. They must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReductionOperation &op);
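+
+ /* Usage sketch (illustrative only). "src" and "dst" are assumed pre-initialised
+  * CLTensor objects and the axis set is an example value:
+  *
+  *   CLReduceOperation reduce(nullptr); // no shared memory manager in this sketch
+  *   reduce.configure(&src, &dst, std::set<uint32_t>{0, 2}, false, ReductionOperation::SUM);
+  *   reduce.run();
+  */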
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+ bool _keep_dims;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+ CLReshapeLayer _reshape;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
new file mode 100644
index 000000000..bb741d98d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSPLITVEX__
+#define __ARM_COMPUTE_CLSPLITVEX__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/core/Types.h"
+#include <vector>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSplitVKernel */
+class CLSplitVEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLSplitVEx();
+ /** Configure the split CL kernel
+ *
+ * @param[in] input The input tensor to split. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] size_splits A 1-D tensor containing the number of tensor values per split
+ * @param[in] split_dim Integer value representing the input tensor dimension along which to
+ * split
+ * @param[out] outputs A vector containing the output tensors. Data types supported: Same as @p
+ * input. The output tensors should match the input tensor dimensions for all
+ * shape dimensions apart from the split dimension.
+ * @param[in] num_splits Number of splits
+ */
+ void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+ const std::vector<ICLTensor *> &outputs, unsigned int num_splits);
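+
+ /* Usage sketch (illustrative only). "src", "size_splits" and the output tensors are
+  * assumed pre-initialised CLTensor objects; splitting into 3 parts along dimension 1
+  * is an example:
+  *
+  *   std::vector<ICLTensor *> outs{&out0, &out1, &out2};
+  *   CLSplitVEx splitv;
+  *   splitv.configure(&src, &size_splits, 1, outs, 3);
+  *   splitv.run();
+  */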
+
+ void run() override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_size_splits;
+ std::vector<ICLTensor *> _outputs;
+ unsigned int _num_splits;
+ std::vector<CLSlice> _slice_functions;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
new file mode 100644
index 000000000..e301a5152
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
+#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
+#define __ARM_COMPUTE_CLTOPK_V2_H__
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute TopKV2 operation.
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
+ CLTopKV2();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Construct a new CLTopKV2 object by using the move constructor
+ * @param[in] CLTopKV2 object to move
+ */
+ CLTopKV2(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Move-assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to move-assign from
+ */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input tensor. Data types supported: U8/S16/F32.
+ * @param[in] k The number of largest elements to find (top-k).
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @param[in] total_bits (Optional) Total number of key bits used by the radix-sort path.
+ * Defaults to 32.
+ * @param[in] bits (Optional) Number of bits processed per radix-sort pass. Defaults to 4.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits = 32, int bits = 4);
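+
+ /* Usage sketch (illustrative only). "src", "values" and "indices" are assumed
+  * pre-initialised CLTensor objects; k = 5 is an example value:
+  *
+  *   CLTopKV2 topk;
+  *   topk.configure(&src, 5, &values, &indices);
+  *   topk.run(); // backend selected via the ACL_TOPKV2 environment variable (see below)
+  */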
+
+ /**
+ * @brief Run the kernels contained in the function
+ * Depending on the value of the following environment variables it works differently:
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+ * quick sort on GPU is used.
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU",
+ * radix sort on GPU is used.
+ * - For any other value, TopKV2 runs on the CPU
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+// The GPU implementation is disabled because it currently produces invalid results.
+// TODO Enable the GPU implementation after verification, or remove this code.
+#if 0
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+#endif
+};
+} // namespace arm_compute
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
new file mode 100644
index 000000000..5fb102e47
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic function to compute the deconvolution layer. This function calls the following OpenCL
+ * kernels/functions:
+ *
+ * -# @ref CLGEMMDeconvolutionLayer
+ * -# @ref CLDirectTransposeConvLayer
+ */
+class CLTransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same
+ * as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo());
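+
+ /* Usage sketch (illustrative only). "src", "weights", "bias" and "dst" are assumed
+  * pre-initialised CLTensor objects; the stride/padding values are examples:
+  *
+  *   CLTransposeConvLayer deconv;
+  *   PadStrideInfo deconv_info(2, 2, 1, 1); // stride 2x2, padding 1x1
+  *   deconv.configure(&src, &weights, &bias, &dst, deconv_info, 0, 0);
+  *   deconv.run();
+  */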
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs. Data types supported:
+ * QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as
+ * @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ static DeconvolutionMethod
+ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info);
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<IFunction> _function;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
new file mode 100644
index 000000000..efc296d6c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
+#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
+
+#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+#include <arm_compute/runtime/NEON/functions/NECastBool.h>
+#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
+#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEOneHot.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
+#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
+
+#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
new file mode 100644
index 000000000..026d30098
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+class NEBinaryLogicalOperation : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8.
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op);
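+
+ /* Usage sketch (illustrative only). "a", "b" and "dst" are assumed pre-initialised
+  * Tensor objects of type QASYMM8/U8:
+  *
+  *   NEBinaryLogicalOperation logical_op;
+  *   logical_op.configure(&a, &b, &dst, BinaryLogicalOperation::AND);
+  *   logical_op.run();
+  *
+  * The statically-typed aliases declared below (e.g. NELogicalAnd) fix the operation at
+  * compile time instead.
+  */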
+};
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+
+/** Basic function to run a logical AND operation. */
+using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+/** Basic function to run a logical OR operation. */
+using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
new file mode 100644
index 000000000..c8b08af8d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTBOOL_H__
+#define __ARM_COMPUTE_NECASTBOOL_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to run @ref NECastBoolKernel.
+ */
+class NECastBool : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECastBool
+ *
+ * @param[in] input Source tensor info. Data types supported: U8.
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
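+
+ /* Usage sketch (illustrative only). "bool_in" (U8) and "f32_out" (F32) are assumed
+  * pre-initialised Tensor objects:
+  *
+  *   NECastBool cast;
+  *   cast.configure(&bool_in, &f32_out); // boolean U8 -> F32
+  *   cast.run();
+  */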
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
new file mode 100644
index 000000000..63f7714aa
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file NEEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class NEEmbeddingLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input. Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEEmbeddingLookup
+ *
+ * @param[in] input Source tensor info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ * @param[in] lookups Lookups tensor info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
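
A sketch of the intended call pattern, checking validate() before configure()/run(). The 64x1000 table, the 5-entry lookups vector and the assumed output-shape rule (indexed dimension replaced by the lookup count) are illustrative only; the parameter order is the one declared above.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"

    using namespace arm_compute;

    void embedding_lookup_example()
    {
      Tensor table, lookups, gathered;
      table.allocator()->init(TensorInfo(TensorShape(64U, 1000U), 1, DataType::F32));
      lookups.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::S32));
      // Assumed output shape: same as the table, with the indexed dimension
      // replaced by the number of lookups.
      gathered.allocator()->init(TensorInfo(TensorShape(64U, 5U), 1, DataType::F32));

      if (NEEmbeddingLookup::validate(table.info(), gathered.info(), lookups.info()).error_code() !=
          ErrorCode::OK)
        return; // shapes or data types rejected

      NEEmbeddingLookup lookup;
      lookup.configure(&table, &gathered, &lookups);

      table.allocator()->allocate();
      lookups.allocator()->allocate();
      gathered.allocator()->allocate();
      // ... fill the table and the S32 lookup indices, then:
      lookup.run();
    }
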
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..56548a479
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls
+ * the following kernels:
+ *
+ * -# @ref NETransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayerReshapeWeights
+ *
+ * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
+ * and transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayer
+ *
+ * @param[in] input Source tensor info. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
+ NEQuantizationSymmetricKernel _quant_input_kernel;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ NEMultiplyScaleFactorKernel _multiply_scale_kernel;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _reshape_weights_output;
+ Tensor _quantized_input;
+ Tensor _scale_factor;
+ Tensor _gemmlowp_output;
+ const ITensor *_original_weights;
+ bool _are_weights_reshaped;
+ bool _accumulate_biases;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */
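
A hedged sketch of the hybrid path: float activations with symmetric int8 weights. The 128/64/1 sizes, the 0.05f weight scale and the weight layout are assumptions in the spirit of the regular fully connected layer, not requirements stated by this header; only the configure()/validate() signatures come from it.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"

    using namespace arm_compute;

    void hybrid_fc_example()
    {
      const unsigned int num_inputs = 128, num_outputs = 64, batch = 1;

      Tensor input, weights, bias, output;
      input.allocator()->init(TensorInfo(TensorShape(num_inputs, batch), 1, DataType::F32));
      // Symmetric per-tensor quantized weights; the scale is an assumed example value.
      weights.allocator()->init(
        TensorInfo(TensorShape(num_inputs, num_outputs), 1, DataType::S8, QuantizationInfo(0.05f)));
      bias.allocator()->init(TensorInfo(TensorShape(num_outputs), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(num_outputs, batch), 1, DataType::F32));

      if (NEFullyConnectedHybridLayer::validate(input.info(), weights.info(), bias.info(),
                                                output.info())
            .error_code() != ErrorCode::OK)
        return;

      NEFullyConnectedHybridLayer fc;
      fc.configure(&input, &weights, &bias, &output);

      input.allocator()->allocate();
      weights.allocator()->allocate();
      bias.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill input/weights/bias, then:
      fc.run();
    }

Internally the input is quantized symmetrically, multiplied through NEGEMMLowpMatrixMultiplyCore, and rescaled by the scale-factor kernel, which is why the float input is paired with S8 weights in the signature above.
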
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
new file mode 100644
index 000000000..8f98f220a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and
+ * transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ * @note The difference from NEFullyConnectedLayer is that this class supports weights passed as a
+ * non-constant runtime input, at the cost of some performance.
+ */
+class NEFullyConnectedLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedLayerEx
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFlattenLayerKernel _flatten_kernel;
+ NEConvertFullyConnectedWeights _convert_weights;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
+ NEGEMM _mm_gemm;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _flatten_output;
+ Tensor _gemmlowp_output;
+ Tensor _converted_weights_output;
+ Tensor _reshape_weights_output;
+ const ITensor *_original_weights;
+ bool _are_weights_converted;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+ bool _is_quantized;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..18cb61bf9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        NEFullyConnectedReshapingLayer.h
+ * @brief       This file contains NEFullyConnectedReshapingLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+#include <arm_compute/runtime/Tensor.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class NEFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ enum class KernelType
+ {
+ GENERAL, //< General FC
+ PREPROCESSED_WEIGHTS //< Weights are constant, so they can be preprocessed
+ };
+
+public:
+ NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+ _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+ * @param[in] biases The tensor that is filled with bias values
+ * @param[in] output The destination tensor
+ * @param[in] needs_reshape Whether the input needs to be reshaped before the FullyConnected layer
+ * @param[in] reshape The target shape of the reshaped input. Only valid when needs_reshape is true.
+ * @param[in] kernel_type The kernel type for the actual FullyConnected layer
+ * @return N/A
+ */
+ void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases, arm_compute::ITensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape,
+ KernelType kernel_type);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ const arm_compute::ITensor *_input;
+ const arm_compute::ITensor *_weights;
+ const arm_compute::ITensor *_biases;
+ arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::Tensor _neon_buffer;
+
+private:
+ std::unique_ptr<arm_compute::IFunction> _neon_fc;
+ NEReshapeLayer _neon_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
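
A sketch of driving the wrapper directly, assuming a 4-D feature map that is flattened to 256 elements before the fully connected step; the reshape target, all shapes and the PREPROCESSED_WEIGHTS choice are illustrative assumptions, while the configure() parameter list is the one declared above.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"

    using namespace arm_compute;

    void fc_reshaping_example()
    {
      Tensor input, weights, bias, output;
      input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32)); // 8*8*4 = 256
      weights.allocator()->init(TensorInfo(TensorShape(256U, 10U), 1, DataType::F32));    // assumed layout
      bias.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(10U, 1U), 1, DataType::F32));

      NEFullyConnectedReshapingLayer fc;
      fc.configure(&input, &weights, &bias, &output,
                   /*needs_reshape=*/true, TensorShape(256U, 1U),
                   NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS);

      input.allocator()->allocate();
      weights.allocator()->allocate();
      bias.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill input/weights/bias, then:
      fc.prepare();
      fc.run();
    }
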
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
new file mode 100644
index 000000000..155a1b837
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHEREX_H__
+#define __ARM_COMPUTE_NEGATHEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEGatherKernelEx */
+class NEGatherEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following type: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis);
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */
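
The sketch below gathers 4 slices along axis 1 of a 10x20 tensor. The output-shape rule (the indexed dimension is replaced by the indices shape) is the usual gather semantics and is assumed here; the configure()/validate() signatures come from the header.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEGatherEx.h"

    using namespace arm_compute;

    void gather_example()
    {
      Tensor params, indices, gathered;
      params.allocator()->init(TensorInfo(TensorShape(10U, 20U), 1, DataType::F32));
      indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
      gathered.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

      if (NEGatherEx::validate(params.info(), indices.info(), gathered.info(), 1).error_code() !=
          ErrorCode::OK)
        return;

      NEGatherEx gather;
      gather.configure(&params, &indices, &gathered, /*axis=*/1);

      params.allocator()->allocate();
      indices.allocator()->allocate();
      gathered.allocator()->allocate();
      // ... fill params and indices (index values in [0, 20)), then:
      gather.run();
    }
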
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
new file mode 100644
index 000000000..521a05ad9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file NEHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class NEHashtableLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+ ITensor *hits);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEHashtableLookup
+ *
+ * @param[in] lookups Lookups 1D tensor info.
+ * Data types supported: S32
+ * @param[in] keys Keys 1D tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup
+ * hits (True) or not (False). Data types supported: U8/QASYMM8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
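
The sketch pairs a 100-entry S32 key vector with a 100-row value table and looks up 3 keys. The row-per-key layout and all sizes are assumptions; the five-tensor configure() order (lookups, keys, input, output, hits) is the one declared above.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"

    using namespace arm_compute;

    void hashtable_lookup_example()
    {
      Tensor lookups, keys, values, output, hits;
      lookups.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
      keys.allocator()->init(TensorInfo(TensorShape(100U), 1, DataType::S32));
      values.allocator()->init(TensorInfo(TensorShape(16U, 100U), 1, DataType::F32)); // one 16-float row per key (assumed)
      output.allocator()->init(TensorInfo(TensorShape(16U, 3U), 1, DataType::F32));
      hits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U8)); // nonzero where the key was found

      NEHashtableLookup lookup;
      lookup.configure(&lookups, &keys, &values, &output, &hits);

      lookups.allocator()->allocate();
      keys.allocator()->allocate();
      values.allocator()->allocate();
      output.allocator()->allocate();
      hits.allocator()->allocate();
      // ... fill lookups/keys/values, then:
      lookup.run();
    }
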
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..18e813923
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform an instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref NEInstanceNormalizationLayerKernelEx
+ */
+class NEInstanceNormalizationLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor.
+ * Defaults to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEInstanceNormalizationLayer.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults
+ * to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEInstanceNormalizationLayerKernelEx _normalization_kernel;
+ bool _is_nchw;
+ NEPermute _permute_input;
+ NEPermute _permute_output;
+ Tensor _permuted_input;
+ Tensor _permuted_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
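
Unlike the stock NEON instance normalization, gamma and beta are passed as tensors here. The sketch below assumes per-channel 1-D gamma/beta of length C and an NHWC F32 input, which goes beyond what the header itself guarantees.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"

    using namespace arm_compute;

    void instance_norm_example()
    {
      const unsigned int C = 8, W = 16, H = 16, N = 1;

      Tensor in, out, gamma, beta;
      // NHWC: in ACL dimension order the innermost (x) dimension holds the channels.
      TensorInfo nhwc_info(TensorShape(C, W, H, N), 1, DataType::F32);
      nhwc_info.set_data_layout(DataLayout::NHWC);
      in.allocator()->init(nhwc_info);
      out.allocator()->init(nhwc_info);
      gamma.allocator()->init(TensorInfo(TensorShape(C), 1, DataType::F32)); // assumed per-channel scale
      beta.allocator()->init(TensorInfo(TensorShape(C), 1, DataType::F32));  // assumed per-channel offset

      NEInstanceNormalizationLayerEx norm;
      norm.configure(&in, &out, &gamma, &beta, 1e-12f);

      in.allocator()->allocate();
      out.allocator()->allocate();
      gamma.allocator()->allocate();
      beta.allocator()->allocate();
      // ... fill in/gamma/beta, then:
      norm.run();
    }
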
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
new file mode 100644
index 000000000..b2ea6270f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOT_H__
+#define __ARM_COMPUTE_NEONEHOT_H__
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Basic function to run @ref NEOneHotKernel */
+class NEOneHot : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up
+ * to 3. Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+ *
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank:
+ * up to 3. Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis = -1);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOT_H__ */
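
A sketch of a one-hot expansion of 4 indices with depth 10. The mapping of axis = -1 onto the innermost output dimension is an assumption about the kernel's shape rule; the six-parameter configure() signature is the one declared above.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEOneHot.h"

    using namespace arm_compute;

    void one_hot_example()
    {
      Tensor indices, depth, on_value, off_value, output;
      indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
      depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));     // would hold e.g. 10
      on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));  // e.g. 1.0f
      off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32)); // e.g. 0.0f
      // Assumed output shape for axis = -1: the one-hot dimension (depth = 10)
      // becomes the innermost dimension, giving a 10x4 output for 4 indices.
      output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

      NEOneHot one_hot;
      one_hot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis=*/-1);

      indices.allocator()->allocate();
      depth.allocator()->allocate();
      on_value.allocator()->allocate();
      off_value.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill indices/depth/on_value/off_value, then:
      one_hot.run();
    }
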
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
new file mode 100644
index 000000000..91eec815c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceOperation : public IFunction
+{
+public:
+ /** Constructor */
+ NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] op Reduce operation to perform.
+ */
+ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output,
+ ReductionOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEReduceOperation
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] op Reduce operation to perform.
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output, ReductionOperation op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<NEReductionOperation> _reduction_kernels;
+ std::vector<Tensor> _reduced_outs;
+ NEReshapeLayer _reshape;
+ unsigned int _reduction_ops;
+ bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */
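
A sketch reducing a 32x8 tensor along axis 1 with keep_dims = true; the shapes and the MEAN_SUM choice are illustrative, while the configure() signature and the Coordinates-based axis list come from this header.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"

    using namespace arm_compute;

    void reduce_mean_example()
    {
      Tensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
      // keep_dims = true retains the reduced axis with length 1.
      out.allocator()->init(TensorInfo(TensorShape(32U, 1U), 1, DataType::F32));

      NEReduceOperation reduce;
      reduce.configure(&in, Coordinates(1), /*keep_dims=*/true, &out, ReductionOperation::MEAN_SUM);

      in.allocator()->allocate();
      out.allocator()->allocate();
      // ... fill in, then:
      reduce.run();
    }
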
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
new file mode 100644
index 000000000..48b416923
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceSum : public IFunction
+{
+public:
+ /** Constructor */
+ NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] reduction_axis Reduction axis vector.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<NEReductionOperation> _reduction_kernels;
+ std::vector<Tensor> _reduced_outs;
+ NEReshapeLayer _reshape;
+ unsigned int _reduction_ops;
+ bool _keep_dims;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */
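
NEReduceSum follows the same pattern as NEReduceOperation above, minus the explicit reduction op; a compact sketch with keep_dims = false and assumed shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEReduceSum.h"

    using namespace arm_compute;

    void reduce_sum_example()
    {
      Tensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32)); // axis 1 dropped (keep_dims = false)

      NEReduceSum reduce;
      reduce.configure(&in, Coordinates(1), /*keep_dims=*/false, &out);

      in.allocator()->allocate();
      out.allocator()->allocate();
      // ... fill in, then:
      reduce.run();
    }
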
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
new file mode 100644
index 000000000..24ff5dac9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user-specified
+ * value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimensions.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
+ * -# @ref NEPermute
+ * -# @ref NEReverse
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data types
+ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
+ * for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution; this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data types
+ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;
+ CPPUpsample _upsample_f;
+ NEReverse _flip_weights;
+ Tensor _scaled_output;
+ Tensor _weights_flipped;
+ Tensor _flip_axis;
+ const ITensor *_original_weights;
+ ITensor *_input;
+ PadStrideInfo _info;
+ bool _is_prepared;
+};
+} // arm_compute
+#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
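For orientation, the sketch below shows how this function might be driven from host code. It is not part of the patch; the shapes, the stride of 2, and the zero invalid_right/invalid_bottom values are illustrative assumptions, and the include paths assume the ARMComputeEx layout introduced by this import.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_transpose_conv_sketch()
{
  Tensor input, weights, bias, output;
  // Illustrative FP32 shapes: 14x14x64 input, 3x3 kernels, 64 IFM -> 32 OFM.
  input.allocator()->init(TensorInfo(TensorShape(14U, 14U, 64U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 32U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
  // (14 - 1) * 2 - 0 + 3 = 29, per the output-size formula documented above.
  output.allocator()->init(TensorInfo(TensorShape(29U, 29U, 32U), 1, DataType::F32));

  NETransposeConvLayer deconv;
  deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 0, 0), 0, 0);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  bias.allocator()->allocate();
  output.allocator()->allocate();
  // Fill input/weights/bias here, then:
  deconv.run(); // flips the weights, upsamples, then runs NEConvolutionLayer internally
}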
diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py
new file mode 100755
index 000000000..f37c2a957
--- /dev/null
+++ b/compute/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import collections
+import os.path
+import re
+import subprocess
+import glob
+
+
+def resolve_includes(target, source):
+ # File collection
+ FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
+
+ # Include pattern
+ pattern = re.compile("#include \"(.*)\"")
+
+ # Get file contents
+ files = []
+ for i in range(len(source)):
+ src = source[i]
+ dst = target[i]
+ f = open(src)
+ cts = f.read()
+ f.close()
+ contents = cts.splitlines()
+ entry = FileEntry(target_name=dst, file_contents=contents)
+ files.append((os.path.basename(src), entry))
+
+ # Create dictionary of tupled list
+ files_dict = dict(files)
+
+ # Check for includes (can only be files in the same folder)
+ final_files = []
+ for file in files:
+ done = False
+ tmp_file = file[1].file_contents
+ print(file[1].target_name)
+ while not done:
+ file_count = 0
+ updated_file = []
+ for line in tmp_file:
+ found = pattern.search(line)
+ if found:
+ include_file = found.group(1)
+ data = files_dict[include_file].file_contents
+ updated_file.extend(data)
+ else:
+ updated_file.append(line)
+ file_count += 1
+
+            # Check if all includes are replaced.
+ if file_count == len(tmp_file):
+ done = True
+
+ # Update temp file
+ tmp_file = updated_file
+
+ # Append and prepend string literal identifiers and add expanded file to final list
+ tmp_file.insert(0, "R\"(\n")
+ tmp_file.append("\n)\"")
+ entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
+ final_files.append((file[0], entry))
+
+ # Write output files
+ for file in final_files:
+ with open(file[1].target_name, 'w+') as out_file:
+ out_file.write("\n".join(file[1].file_contents))
+
+
+# Generate embed files
+cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
+cl_files += glob.glob('src/core/CL/cl_kernels/*.h')
+
+# DEBUG: print cl files
+print("cl_files:")
+print(cl_files)
+
+embed_files = [f + "embed" for f in cl_files]
+print("embed_files:")
+print(embed_files)
+
+resolve_includes(embed_files, cl_files)
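The script's only contract is that each generated *.clembed file expands to a single C++ raw string literal, because the expanded source is wrapped in R"( ... )". A minimal consumption sketch, mirroring the pattern used by CLKernelLibrary.cpp just below (the neg_tensor.clembed name is only an example), looks like this:

#include <map>
#include <string>

// Illustrative only: #include-ing a generated *.clembed inside an initializer
// yields one raw string literal per program.
static const std::map<std::string, std::string> example_program_source_map = {
    {
        "neg_tensor.cl",
#include "./cl_kernels/neg_tensor.clembed"
    },
};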
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 000000000..81d0cb70f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ // ARMComputeEx kernels
+ {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
+ {"cast_bool", "cast.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"gather_ex", "gather_ex.cl"},
+ {"gather_ex_1d", "gather_ex.cl"},
+ {"gather_ex_1d_out", "gather_ex.cl"},
+ {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
+ {"hashtable_lookup", "hashtable_lookup.cl"},
+ {"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"multiply_scale_factor", "multiply_scale_factor.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+ {"one_hot", "one_hot.cl"},
+ {"one_hot_only_on_value", "one_hot.cl"},
+ {"quantization_symm8", "quantization_symm8.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"radixsort_histogram", "topkv2_radixsort.cl"},
+ {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+ {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"scale_factor_symm8", "scale_factor.cl"},
+};
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+ {
+ "arg_min_max_ex.cl",
+#include "./cl_kernels/arg_min_max_ex.clembed"
+ },
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "embedding_lookup.cl",
+#include "./cl_kernels/embedding_lookup.clembed"
+ },
+ {
+ "gather_ex.cl",
+#include "./cl_kernels/gather_ex.clembed"
+ },
+ {
+ "gemmlowp_ex.cl",
+#include "./cl_kernels/gemmlowp_ex.clembed"
+ },
+ {
+ "hashtable_lookup.cl",
+#include "./cl_kernels/hashtable_lookup.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+ },
+ {
+ "instance_normalization_ex.cl",
+#include "./cl_kernels/instance_normalization_ex.clembed"
+ },
+ {
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
+ "multiply_scale_factor.cl",
+#include "./cl_kernels/multiply_scale_factor.clembed"
+ },
+ {
+ "neg_tensor.cl",
+#include "./cl_kernels/neg_tensor.clembed"
+ },
+ {
+ "one_hot.cl",
+#include "./cl_kernels/one_hot.clembed"
+ },
+ {
+ "quantization_symm8.cl",
+#include "./cl_kernels/quantization_symm8.clembed"
+ },
+ {
+ "reduce_operation.cl",
+#include "./cl_kernels/reduce_operation.clembed"
+ },
+ {
+ "scale_factor.cl",
+#include "./cl_kernels/scale_factor.clembed"
+ },
+ {
+ "topkv2.cl",
+#include "./cl_kernels/topkv2.clembed"
+ },
+ {
+ "topkv2_radixsort.cl",
+#include "./cl_kernels/topkv2_radixsort.clembed"
+ },
+ {
+ "topkv2_quicksort.cl",
+#include "./cl_kernels/topkv2_quicksort.clembed"
+ },
+
+#endif /* EMBEDDED_KERNELS */
+};
+
+CLKernelLibraryEx::CLKernelLibraryEx()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
+ // CLKernelLibraryEx is built
+}
+
+CLKernelLibraryEx &CLKernelLibraryEx::get()
+{
+ static CLKernelLibraryEx _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if (_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+ std::string concat_str;
+
+ if (fp16_supported())
+ {
+ concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+ }
+
+ if (get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else if (arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if (_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
+ cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
+bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
+
+bool CLKernelLibraryEx::int64_base_atomics_supported() const
+{
+ return device_supports_extension(_device, "cl_khr_int64_base_atomics");
+}
+
+const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if (program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else /* EMBEDDED_KERNELS */
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if (std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name,
+ std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if (std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif /* EMBEDDED_KERNELS */
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
+{
+ std::string concat_set;
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for (const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
+
+std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
+{
+ size_t result;
+
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
+}
+
+cl::NDRange CLKernelLibraryEx::default_ndrange() const
+{
+ // GPUTarget _target = get_target_from_device(_device);
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget _target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch (_target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
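A minimal host-side sketch of how a kernel could be requested from this library follows. It is not part of the patch: it assumes CLKernelLibraryEx exposes an init() entry point analogous to CLKernelLibrary's for binding the kernel path, context, and device, that StringSet aliases std::set<std::string>, and that the build options shown are merely illustrative.

#include "arm_compute/core/CL/CLKernelLibraryEx.h"

#include <set>
#include <string>

using namespace arm_compute;

Kernel make_neg_tensor_kernel(cl::Context context, cl::Device device)
{
  // Assumed: an init() analogous to CLKernelLibrary's, binding path/context/device.
  CLKernelLibraryEx::get().init("./cl_kernels/", context, device);

  // Build options are illustrative; the defines each kernel requires are
  // documented in the corresponding .cl source file.
  std::set<std::string> build_opts{"-DDATA_TYPE=float", "-DVEC_SIZE=16"};
  return CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts);
}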
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
new file mode 100644
index 000000000..0a014d15c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) \
+ select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
+#define ISLESS(x, y) \
+ select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
+#if defined(WIDTH)
+#if defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+/** Find the index of the minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the minimum value within the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
+ __global const DATA_TYPE_OUTPUT *prev_res,
+ const int x_idx)
+{
+ int end_elem = (x_idx + 1) * 16;
+ if (end_elem > WIDTH)
+ {
+ end_elem = WIDTH - x_idx * 16;
+ }
+ DATA_TYPE_OUTPUT res = prev_res[0];
+ for (int x_v = 1; x_v < end_elem; ++x_v)
+ {
+ res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
+ }
+ return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find the index of the minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the minimum value within the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+ DATA_TYPE_OUTPUT res = 0;
+ for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+ {
+ res = select(res, x_v, *(input + x_v) < *(input + res));
+ }
+ return res;
+#else // WIDTH >= 16
+ int x_elem = x_idx * 16;
+ const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+ x_elem -= x_goback;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input - x_goback);
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+ idx_sel = (in.s01234567 <= in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+ idx_sel.s0123 = (in.s0123 < in.s4567) ||
+ (in.s0123 == in.s4567 &&
+ CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+ idx_sel.s01 =
+ (in.s01 < in.s23) ||
+ (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+ idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+ return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MIN)
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+/** Find the index of the maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the maximum value within the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
+ __global const DATA_TYPE_OUTPUT *prev_res,
+ const int x_idx)
+{
+ int end_elem = (x_idx + 1) * 16;
+ if (end_elem > WIDTH)
+ {
+ end_elem = WIDTH - x_idx * 16;
+ }
+ DATA_TYPE_OUTPUT res = prev_res[0];
+ unsigned int res_int = res;
+ DATA_TYPE_OUTPUT condition_check2;
+ for (int x_v = 1; x_v < end_elem; ++x_v)
+ {
+ int i1 = prev_res[x_v];
+ condition_check2 = *(input + i1) > *(input + res_int);
+ res = select(res, prev_res[x_v], condition_check2);
+ }
+ return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find the index of the maximum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the maximum value within the vector.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+ DATA_TYPE_OUTPUT res = 0;
+ unsigned int i1;
+ unsigned int i2;
+ DATA_TYPE_OUTPUT condition_check;
+ for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+ {
+ i1 = x_v;
+ i2 = res;
+ condition_check = *(input + i1) > *(input + i2);
+ res = select(res, x_v, condition_check);
+ }
+ return res;
+#else // WIDTH >= 16
+ int x_elem = x_idx * 16;
+ const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+ x_elem -= x_goback;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input - x_goback);
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+ idx_sel = (in.s01234567 >= in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+ idx_sel.s0123 = (in.s0123 > in.s4567) ||
+ (in.s0123 == in.s4567 &&
+ CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+ idx_sel.s01 =
+ (in.s01 > in.s23) ||
+ (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+ idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+ return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX)
+
+/** This kernel performs a parallel reduction for a given operation along the x-axis.
+ *
+ * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
+ * using -DPREV_OUTPUT
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
+ * ArgMax
+ * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
+ * ArgMin
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[in] prev_res_ptr (Optional) Pointer to previous results
+ * tensor. Supported data types: U32/S32
+ * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X
+ * dimension (in bytes)
+ * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y
+ * dimension (in bytes)
+ * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element
+ * in the previous results tensor
+ * @param[in] partial_res_ptr The local buffer to hold partial result
+ * values. Supported data types: U32/S32
+ * @param[in] partial_res_stride_x Stride of the output tensor in X dimension
+ * (in bytes)
+ * @param[in] partial_res_step_x partial_res_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension
+ * (in bytes)
+ * @param[in] partial_res_step_y partial_res_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[in] local_results Local buffer for storing the partial result
+ */
+__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
+#if defined(PREV_OUTPUT)
+ IMAGE_DECLARATION(prev_res),
+#endif // defined(PREV_OUTPUT)
+ IMAGE_DECLARATION(partial_res),
+ __local DATA_TYPE_OUTPUT *local_results)
+{
+#if defined(PREV_OUTPUT)
+ Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+ Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
+#else // !defined(PREV_OUTPUT)
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#endif // defined(PREV_OUTPUT)
+ Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+
+ unsigned int lsize = get_local_size(0);
+ unsigned int lid = get_local_id(0);
+
+ const uint x_idx = get_global_id(0);
+ const uint y_idx = get_global_id(1);
+ const __global DATA_TYPE *src_in_row =
+ (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes +
+ y_idx * src_step_y);
+
+ for (unsigned int y = 0; y < get_local_size(1); ++y)
+ {
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_max_prev_out(
+ src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else // !defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#else // defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_min_prev_out(
+ src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else // !defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Looking for the next highest power of 2 (maximum value of lsize is 8)
+ unsigned int middle = lsize - 1;
+ middle |= middle >> 1;
+ middle |= middle >> 2;
+ middle += 1;
+ // Perform parallel reduction
+ DATA_TYPE_OUTPUT condition_check3;
+ for (unsigned int i = middle; i > 0; i >>= 1)
+ {
+ if (lid < i && lid + i < lsize)
+ {
+ DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
+ DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
+#if defined(ARG_MAX)
+ condition_check3 =
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
+ local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
+#else // defined(ARG_MIN)
+ local_results[lid] = select(
+ local_results[lid], local_results[lid + i],
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (lid == 0)
+ {
+ ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+ }
+ }
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs a reduction along the y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
+ * -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the resulting index values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (unsigned int y = 1; y < HEIGHT; ++y)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in =
+ CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, y, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs a reduction along the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the resulting index values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, z, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction along the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the resulting index values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, w, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
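As a hedged illustration of the compile-time contract documented above, the helper below assembles the -D defines an x-axis ArgMax pass over FP32 data would need. The helper name and the float/uint/int choices are assumptions; the macro names come from the kernel documentation.

#include <set>
#include <string>

// Hypothetical helper: build options for an x-axis ArgMax pass over FP32 data.
std::set<std::string> arg_max_x_build_opts(unsigned int width)
{
  std::set<std::string> opts;
  opts.insert("-DDATA_TYPE=float");
  opts.insert("-DFLOAT_DATA_TYPE");       // selects the isgreater()/isless() comparisons
  opts.insert("-DDATA_TYPE_OUTPUT=uint"); // index type written to the output
  opts.insert("-DDATA_TYPE_SELECT=int");
  opts.insert("-DARG_MAX");               // use -DARG_MIN for the minimum index instead
  opts.insert("-DWIDTH=" + std::to_string(width));
  // A second pass over partial results would additionally pass -DPREV_OUTPUT.
  return opts;
}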
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..e249663bc
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** Returns the truth value of the two input tensors for a binary logical op,
+ * where the binary logical op can be AND or OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
+ * e.g. -DVEC_SIZE=16
+ * @attention The operation type (code) specifying which operation to perform should be passed as
+ * a preprocessor argument using -DOP_CODE=number, e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input2_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] output_ptr Pointer to the destination tensor.
+ * Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  return;
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
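For reference, a small sketch of the defines this kernel expects; the OP_CODE mapping (1 = AND, 2 = OR) is taken from the kernel body above, while the helper name and the uchar/16 choices are illustrative.

#include <set>
#include <string>

// Hypothetical helper: defines for binary_logical_op; 1 = AND, 2 = OR per the kernel body.
std::set<std::string> binary_logical_build_opts(bool logical_and)
{
  std::set<std::string> opts;
  opts.insert("-DDATA_TYPE=uchar"); // 8-bit (QASYMM8-style) data, as in the kernel docs
  opts.insert("-DVEC_SIZE=16");
  opts.insert(logical_and ? "-DOP_CODE=1" : "-DOP_CODE=2");
  return opts;
}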
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..3b0a175a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs an up-scaling depth conversion for boolean type input.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The integer shift amount value needs to be passed at compile time using -DSHIFT:
+ * e.g. -DSHIFT=7
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+ (__global DATA_TYPE_OUT *)out.ptr);
+}
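A scalar C++ analogue of the per-element behaviour, illustrative only and not part of the patch: the boolean byte is masked with 1 and widened to the output type, here assumed to be short.

#include <cstdint>

// Mirrors CONVERT(in_data & 1, ...) from the kernel: only the least significant
// bit of the boolean byte survives, then the value is widened to the output type.
int16_t cast_bool_scalar(uint8_t in) { return static_cast<int16_t>(in & 1); }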
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..92e5dfbee
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention The number of input dimensions is passed as a preprocessor argument using
+ * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data
+ * types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in
+ * bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ * vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+  // lookup ids based on the tensor dimensions
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+ : get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+ : get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+ : get_global_id(2) % DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+ lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+ (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
new file mode 100644
index 000000000..2236021f1
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ * @attention Number of indices dimensions should be given as a preprocessor argument using
+ * -DINDICES_DIM=size. e.g. -DINDICES_DIM=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in]  input_stride_z                          Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in]  input_stride_w                          Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in]  indices_ptr                             Pointer to the indices tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the
+ *                                                     indices tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination
+ * tensor
+ */
+__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
+#elif AXIS == 1
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
+#elif AXIS == 2
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
+#elif AXIS == 3
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
+#endif // AXIS
+
+ *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
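+
+// Example (illustrative): with -DAXIS=1 and -DINDICES_DIM=1 the kernel implements
+// output(x, i, z, w) = input(x, indices(i), z, w): the Y coordinate of the output is looked up
+// through the indices vector and every other coordinate is passed through unchanged.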
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
new file mode 100644
index 000000000..80ba73d1d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
+ defined(COLS_A)
+#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B
+ * (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data type:
+ * QASYMM8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data type:
+ * same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ int end_row_vec_a = src_addr.s0 + COLS_A;
+
+ VECTOR_INT acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_INT acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_INT acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_INT acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ VECTOR_INT acc4 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+ {
+ // Load values from matrix A
+ char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ char2 a4 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ // Load values from matrix B
+ VECTOR_CHAR b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+ VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
+ 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0;
+ acc0 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1.s0;
+ acc1 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2.s0;
+ acc2 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3.s0;
+ acc3 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4.s0;
+ acc4 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a4.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+ {
+ // Load values from matrix A
+ char a0 = *(__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char a1 = *(__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char a2 = *(__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char a3 = *(__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ char a4 = *(__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ // Load values from matrix B
+ VECTOR_CHAR b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ }
+
+ const int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) +
+ (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint8)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
+ // Store the result
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += z * dst_stride_z;
+
+ // Store the result
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
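+
+// Note (illustrative): each work-item of this kernel accumulates NUM_ELEMS_PROCESSED_PER_THREAD_X
+// output columns for up to NUM_ELEMS_PROCESSED_PER_THREAD_Y rows, multiplying signed 8-bit values
+// of matrix A with signed 8-bit values of matrix B into 32-bit accumulators; the main loop
+// consumes two columns of A (two rows of B) per iteration and a scalar tail loop handles the rest.
+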
+#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) &&
+ // defined(COLS_A)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..a4f7dbd48
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Performs a hashtable lookup on the input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions should be given as a preprocessor argument using
+ * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in]  input_step_w                            input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_z                         Stride of the destination tensor in Z dimension
+ *                                                     (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_w                         Stride of the destination tensor in W dimension
+ *                                                     (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data
+ * types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in
+ * bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups
+ * vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+ : get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+ : get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+ : get_global_id(2) % DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
+
+ if (lup_id[NUM_DIMS - 1] < 0)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+ return;
+ }
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+ lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+ (__global DATA_TYPE *)out.ptr);
+}
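+
+// Example (illustrative): with NUM_DIMS == 2 and a lookups vector {2, -1}, output row y == 0 is
+// copied from input row 2, while output row y == 1 is filled with zeros because its lookup key
+// is negative (i.e. not found).
+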
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..e07a25ec9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
+#define EXPAND(x) x
+
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
+
+/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
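+
+// Example (illustrative): ROTATE(v, 4, 1) expands to ROT4_1(v), i.e. ((v).s3012), a circular
+// right shift of a 4-element vector by one position.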
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each
+ * element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
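+
+// Example (illustrative): VEC_OFFS(VEC_DATA_TYPE(int, 4), 4) expands to (int4)(0, 1, 2, 3).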
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// The convert_* built-in functions with the _sat modifier are not supported for floating-point
+// types, so we create defines without _sat to overcome this issue
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+ uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, name##_step_x, name##_stride_y, \
+ name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+ mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ *
+ * @return A vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x)
+{
+ Vector vector = {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr +=
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y,
+ uint step_y, uint stride_z, uint step_z)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z)
+{
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
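+/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
+ * @param[in] step_w                        stride_w * number of elements along W processed per
+ *                                          workitem(in bytes)
+ * @param[in] mod_size                      Depth used to split the Z work-item id into the Z and W
+ *                                          coordinates
+ *
+ * @return A 4D tensor object
+ */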
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+ (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..5f1b3f902
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Convert the given vector with round to nearest even rounding mode
+ *
+ * @param[in] x The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+ float out_f32 = input / scale + offset;
+ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+ return res_u8;
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
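+
+// Worked example (illustrative): quantize_qasymm8(3.2f, 128.f, 0.5f) computes 3.2 / 0.5 + 128 =
+// 134.4, rounds to nearest (134) and saturates to uchar, so the result is 134;
+// dequantize_qasymm8(134, 128.f, 0.5f) maps it back to (134 - 128) * 0.5 = 3.0.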
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
+ VEC_DATA_TYPE(type, size)); \
+ return res; \
+ }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+ }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
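+
+// Worked example (illustrative): for x = 5 and exponent = 1 the mask is 1 and the threshold is 0,
+// so the result is (5 >> 1) + 1 = 3 (5 / 2 = 2.5 rounded away from zero); for x = -5 the threshold
+// is 1 and the result is (-5 >> 1) + 0 = -3.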
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* Revert COMPMID-907 */ \
+ VEC_DATA_TYPE(long, size) \
+ mask1 = 1 << 30; \
+ VEC_DATA_TYPE(long, size) \
+ mask2 = 1 - (1 << 30); \
+ VEC_DATA_TYPE(long, size) \
+ is_positive_or_zero = ab_64 >= 0; \
+ VEC_DATA_TYPE(long, size) \
+ nudge = select(mask2, mask1, is_positive_or_zero); \
+ VEC_DATA_TYPE(long, size) \
+ mask = 1ll << 31; \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
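+
+// Illustrative example (values chosen for this comment only): interpreting 1 << 30 as 0.5 in
+// Q0.31, asymm_mult1(1 << 30, 1 << 30) returns 1 << 29, i.e. 0.25: the result is the rounded
+// high 32 bits of 2 * a * b, and the only saturating case, INT_MIN * INT_MIN, yields INT_MAX.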
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+ }
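+
+// Note on the constants above (derived by inspection): constant_1_over_3 is round(2^31 / 3) and
+// constant_term appears to be round(2^31 * exp(-1/8)), so the body evaluates
+// exp(-1/8) * (1 + x + x^2/2 + x^3/6 + x^4/24) with x = a + 1/8, a degree-4 Taylor expansion of
+// exp(a) around -1/8.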
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ *          corresponding bit in @p if_mask is set.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+ VEC_DATA_TYPE(int, size) then_val, \
+ VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
+ }
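+
+// Illustrative example: when if_mask is all ones (~0) the expression reduces to then_val, and
+// when if_mask is all zeros it reduces to else_val; mixed masks pick bits from both operands.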
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with all bits set where the corresponding element of @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a == 0); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with all bits set where the corresponding element of @p a is non-zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a != 0); \
+ }
+
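+/** Conditionally scales @p result by @p fp_multiplier, as used by asymm_exp_on_negative_values
+ * below: when the bit of @p remainder at position (k_fractional_bits + exponent) is set, the
+ * result is multiplied by @p fp_multiplier, which at the call sites appears to encode
+ * exp(-2^exponent) in Q0.31 format.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Scaled or unchanged result.
+ */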
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK( \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+ size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+ }
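+
+// Range-reduction note (derived by inspection): a is reduced to the interval [-1/4, 0), the
+// polynomial above supplies exp() on that interval, and the EXP_BARREL_SHIFTER steps multiply in
+// precomputed exp(-1/4), exp(-1/2), ..., exp(-16) factors for the part of a removed by the
+// reduction.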
+
+/** Calculates the product of an integer value and a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
+ }
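+
+// Illustrative example (scalar case, values chosen for this comment only): with exponent = 2 the
+// overflow threshold is (1 << 29) - 1, so asymm_saturating_rounding_mult_by_pow21(1 << 30, 2)
+// saturates to INT_MAX instead of wrapping around.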
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
+ }
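+
+// Illustrative example (values chosen for this comment only): for lanes a = (3, -3) and
+// b = (4, -4), asymm_rounding_half_sum2 returns (4, -4); the exact averages are 3.5 and -3.5,
+// and half-way cases are rounded away from zero.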
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+ VEC_DATA_TYPE(int, size) \
+ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+ for (int i = 0; i < 3; i++) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+ VEC_DATA_TYPE(int, size) \
+ tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+ x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+ } \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+ }
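+
+// Implementation note (derived by inspection): this is a Newton-Raphson reciprocal scheme. With
+// d = (1 + a) / 2 from the rounding half sum, the initial estimate is 48/17 - 32/17 * d in Q2.29
+// (Q2_48_over_17, Q2_neg_32_over_17), refined three times via x += x * (1 - d * x); the final
+// doubling rescales the Q2.29 estimate of 1/d to 1/(1 + a) in Q0.31.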
+
+/** Considering the integer value as fixed-point, changes the number of integer bits and rescales
+ * the value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+ int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+ }
+
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+ dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+ DEQUANTIZE_STR(input, offset, scale, type, size)
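+
+// Illustrative usage (types chosen for this comment only): QUANTIZE(in, 0.0f, 0.5f, uchar, 4)
+// expands to quantize_uchar4(in, 0.0f, 0.5f) and DEQUANTIZE(q, 0.0f, 0.5f, uchar, 4) expands to
+// dequantize_uchar4(q, 0.0f, 0.5f); both rely on the *_IMPL instantiations further below.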
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+ asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+ asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+ right_shift, size); \
+ }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
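+
+// Implementation note: a positive shift is applied as a multiplication by 2^shift before the
+// fixed-point multiply, while a negative shift becomes a rounding divide by 2^(-shift) afterwards,
+// so a single signed shift parameter covers both directions.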
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(1)
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(1)
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(1)
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
new file mode 100644
index 000000000..014842680
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
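+// Illustrative build options for the kernel below (example values only; the actual values are
+// normally supplied by the host-side kernel wrapper):
+//   -DDATA_TYPE=float -DVEC_SIZE=16 -DEPSILON=0.001f -DDIM_X=6 -DDIM_Y=2 -DDIM_Z=7 -DGAMMA -DBETA
+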
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+ defined(DIM_Y) && defined(DIM_Z)
+/** This function applies instance normalization: each spatial plane of the input tensor is
+ * normalized using the mean and standard deviation of that plane.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
+ * -DDATA_TYPE=float
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with
+ * -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
+ * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported
+ * data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first
+ * source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor.
+ * Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the destination tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor.
+ * Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X
+ * dimension (in bytes)
+ * @param[in]  gamma_step_x                          (Optional) gamma_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the gamma tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X
+ * dimension (in bytes)
+ * @param[in]  beta_step_x                           (Optional) beta_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the beta tensor
+ */
+__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+#ifdef GAMMA
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // GAMMA
+#ifdef BETA
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // BETA
+ )
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
+ float sum = 0.f;
+ float sum_sq = 0.f;
+
+#if defined(NHWC)
+
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(2); // Current batch
+ const int elements_plane = DIM_Y * DIM_Z;
+
+ for (int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for (int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+ sum += data;
+ sum_sq += data * data;
+ }
+ }
+
+#else // !defined(NHWC)
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+ const int elements_plane = DIM_X * DIM_Y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum = 0.f;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum_sq = 0.f;
+ // Calculate partial sum
+ for (int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum += data;
+ part_sum_sq += data * data;
+ }
+ // Left-overs loop
+ for (; x < DIM_X; ++x)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum.s0 += data;
+ part_sum_sq.s0 += data * data;
+ }
+ }
+// Perform reduction
+#if VEC_SIZE > 8
+ part_sum.s01234567 += part_sum.s89abcdef;
+ part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+ part_sum.s0123 += part_sum.s4567;
+ part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+ part_sum.s01 += part_sum.s23;
+ part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+ part_sum.s0 += part_sum.s1;
+ part_sum_sq.s0 += part_sum_sq.s1;
+
+ sum = (float)part_sum.s0;
+ sum_sq = (float)part_sum_sq.s0;
+
+#endif // defined(NHWC)
+
+ const float mean_float = (sum / elements_plane);
+ const DATA_TYPE mean = (DATA_TYPE)mean_float;
+ const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
+#if defined(GAMMA)
+ const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON);
+ const DATA_TYPE multip = (DATA_TYPE)multip_float;
+#else // !defined(GAMMA)
+ const DATA_TYPE multip = (DATA_TYPE)0;
+#endif // defined(GAMMA)
+#if defined(BETA)
+ const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch);
+#else // !defined(BETA)
+ const DATA_TYPE beta = 0;
+#endif // defined(BETA)
+
+#if defined(NHWC)
+
+ for (int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for (int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address)-mean) * multip + beta;
+ }
+ }
+
+#else // !defined(NHWC)
+ for (int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, input_address);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = (data - mean) * multip + beta;
+ VSTORE(VEC_SIZE)
+ (res, 0, output_address);
+ }
+ // Left-overs loop
+ for (; x < DIM_X; ++x)
+ {
+ __global DATA_TYPE *input_address =
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address =
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address)-mean) * multip + beta;
+ }
+ }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+ defined(DIM_Y) && defined(DIM_Z) */
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
new file mode 100644
index 000000000..3943fc4c2
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
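+// Illustrative computation (values chosen for this comment only): for an input element 20 (S32),
+// a per-row scale of 0.25f and multiplier = 0.5f, the kernel writes 20 * 0.25f * 0.5f = 2.5f to
+// the output tensor.
+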
+/** This kernel multiplies the input by the scale factor.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ *                                                  types: S32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in]  scale_ptr                            Pointer to the scale tensor. Supported data
+ *                                                  types: F16/F32
+ * @param[in] scale_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] scale_step_x scale_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] scale_offset_first_element_in_bytes The offset of the first element in the scale
+ * tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
+ IMAGE_DECLARATION(output), float multiplier)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global int *)input.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ // Create scale vector
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ vscale = *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1));
+
+ // Dequantize
+ vscale *= (DATA_TYPE)(multiplier);
+ val *= vscale;
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (val, 0, (__global DATA_TYPE *)output.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(output.ptr)) =
+ ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
+ *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
new file mode 100644
index 000000000..15c16f80c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of the input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types:
+ * S16/S32/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination
+ * image
+ *
+ */
+__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
new file mode 100644
index 000000000..c274aba62
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
+
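+// Illustrative semantics (values chosen for this comment only): with -DDEPTH=4 and -DAXIS=0, an
+// index value of 2 produces on_value at x == 2 along the one-hot axis and off_value at
+// x == 0, 1 and 3.
+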
+/** Performs the OneHot operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                      Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[in] off_value_ptr Pointer to the off_value vector. Supported
+ * data types: Same as @p on_value.
+ * @param[in] off_value_stride_x Stride of the off_value vector in X
+ * dimension (in bytes)
+ * @param[in] off_value_step_x off_value_stride_x * number of elements
+ * along X processed per work item (in bytes)
+ * @param[in] off_value_offset_first_element_in_bytes Offset of the first element in the off_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+ VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+ const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw);
+ *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw);
+ *(__global DATA_TYPE *)output.ptr = index == py ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw);
+ *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+ *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#endif // AXIS
+}
+
+/** Performs the OneHot operation along the chosen axis, assuming off_value is zero
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                      Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2);
+
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z);
+
+ const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz);
+
+ if (index < 0 || index >= DEPTH)
+ return;
+
+#if AXIS == 0
+ *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 1
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 2
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 3
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#endif // AXIS
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
new file mode 100644
index 000000000..76fda9041
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
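+
+// Illustrative per-element computation (values chosen for this comment only): with
+// IN1_OFFSET = -128 and IN2_OFFSET = -128, inputs 130 and 132 become 2 and 4; their product 8 is
+// scaled by RESULT_MULT_INT with a rounding right shift of RESULT_SHIFT, RESULT_OFFSET is added,
+// and the result is converted back to uchar.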
+/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of
+ * GEMMLowp to QASYMM8
+ *
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to inputs
+ * -# Multiply inputs
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using
+ * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and
+ * -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ *            must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and
+ * -DRESULT_SHIFT
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out), const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Perform multiplication of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+
+ // Multiply with a multiplier smaller than 1
+ out_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+  // TODO: Apply min-max bounds to support fusing with ReLU.
+ /*
+ #if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+ #endif // defined(MIN_BOUND)
+ #if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+ #endif // defined(MAX_BOUND)
+ */
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
new file mode 100644
index 000000000..4ae9adb0b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+#define MIN_QUANT_VAL -127
+#define MAX_QUANT_VAL 127
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+
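+// Illustrative computation (values chosen for this comment only): with a per-row scale of 0.04f,
+// an input of 1.0f quantizes to round(1.0f / 0.04f) = 25, which is then clamped to
+// [MIN_QUANT_VAL, MAX_QUANT_VAL] = [-127, 127] and stored as S8.
+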
+/** This performs the symmetric quantization of floating point inputs to 8-bit signed integers.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g.
+ * -DDATA_TYPE_IN=float
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type.
+ * e.g. -DDATA_TYPE_OUT=char
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g.
+ * -DSCALE=0.125
+ * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g.
+ * -DOFFSET=125
+ * @note Minimum value for quantized type should be given as a preprocessor argument using
+ * -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
+ * @note Maximum value for quantized type should be given as a preprocessor argument using
+ * -DMAX_QUANT_VAL=value. e.g. -DMAXIN_QUANT_VAL=255
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: S8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in]  scale_ptr                            Pointer to the scale tensor. Supported data
+ *                                                  types: F32
+ * @param[in]  scale_stride_x                       Stride of the scale tensor in X dimension
+ *                                                  (in bytes)
+ * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ */
+__kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
+ IMAGE_DECLARATION(output))
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+ // Create scale vector
+ const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale =
+ *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
+
+ // Quantize
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(
+ CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
+ (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
+ int),
+ MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
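+
+// Worked example (illustration only, not used by the kernel): with a per-row scale of 0.05, an
+// input of 1.27 quantizes to round(1.27 / 0.05) = 25, while 12.7 gives 254 and is clamped to
+// MAX_QUANT_VAL = 127. The symmetric range [-127, 127] deliberately excludes -128, which keeps
+// the representable range symmetric around zero.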
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..832ac1270
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in]  axis                                 Axis along which the reduction is performed
+ * @param[in]  dim                                  Size of the input tensor along the reduction axis.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ const int axis, const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value =
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ for (int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+#if OP_CODE == 1 // REDUCE_MAX
+ value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#elif OP_CODE == 2 // REDUCE_MIN
+ value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+ }
+
+ *((__global DATA_TYPE *)out.ptr) = value;
+}
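+
+// Walk-through (comment only): for a 4D input reduced over axis 1 with dim = 5, the work item
+// that owns output coordinate (x, 0, z, w) starts from the input element at (x, 0, z, w) and
+// then folds in (x, 1, z, w) ... (x, 4, z, w) with max() or min(), so every output cell is
+// produced by a single work item scanning the whole reduction axis.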
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in]  axis                                 Axis along which the reduction is performed
+ * @param[in]  dim                                  Size of the input tensor along the reduction axis.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ const int axis, const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE sum_value = (DATA_TYPE)0;
+ for (int i = 0; i < dim; ++i)
+ {
+ indices[axis] = i;
+ sum_value += *(
+ (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ }
+
+#if OP_CODE == 3 // REDUCE_SUM
+ *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+#elif OP_CODE == 4 // REDUCE_MEAN
+ *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+}
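+
+// Note (comment only): the division for REDUCE_MEAN is performed in DATA_TYPE, so with an
+// integer type an input of {1, 2, 2} along the reduction axis (dim = 3) yields 5 / 3 = 1
+// (truncated), whereas a float type yields 1.66...; rounding, if required, has to happen
+// outside this kernel.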
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
new file mode 100644
index 000000000..3d5e90356
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(WIDTH)
+/** This function computes the per-row symmetric 8-bit scale factor of an input tensor as
+ * max(|min|, |max|) / 127.
+ *
+ * @note The width of the input tensor must be provided at compile time using -DWIDTH
+ * (e.g. -DWIDTH=320)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types:
+ * F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the scale factor vector; one scale
+ * value is written per input row. Supported data types: F32.
+ * @param[in]  dst_stride_x                      Stride of the scale factor vector in X dimension
+ * (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the scale
+ * factor vector
+ */
+__kernel void scale_factor_symm8(IMAGE_DECLARATION(src), VECTOR_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ float4 min_value = (float4)FLT_MAX;
+ float4 max_value = (float4)-FLT_MAX;
+
+ int x = 0;
+ __global float *src_addr = (__global float *)(src.ptr);
+
+ for (; x <= (int)(WIDTH - 8); x += 8)
+ {
+ float8 value = vload8(0, (__global float *)(src_addr + x));
+
+ min_value = select(value.s0123, min_value, min_value < value.s0123);
+ min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+ max_value = select(value.s0123, max_value, max_value > value.s0123);
+ max_value = select(value.s4567, max_value, max_value > value.s4567);
+ }
+
+ for (; x < WIDTH; ++x)
+ {
+ float value = *(src_addr + x);
+
+ min_value.s0 = min(min_value.s0, value);
+ max_value.s0 = max(max_value.s0, value);
+ }
+
+ // Perform min/max reduction
+ min_value.s01 = min(min_value.s01, min_value.s23);
+ min_value.s0 = min(min_value.s0, min_value.s1);
+ max_value.s01 = max(max_value.s01, max_value.s23);
+ max_value.s0 = max(max_value.s0, max_value.s1);
+
+ // Extract scale
+ max_value.s0 = max(fabs(min_value.s0), fabs(max_value.s0)) / 127.0f;
+
+  // Store the scale factor
+ *((__global float *)(dst_ptr) + get_global_id(1)) = max_value.s0;
+}
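+
+// Worked example (illustration only): for a row with minimum -6.35 and maximum 3.2, the stored
+// scale is max(|-6.35|, |3.2|) / 127 = 6.35 / 127 = 0.05; quantization_symm8 can then map
+// -6.35 to -127 and 3.2 to round(3.2 / 0.05) = 64.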
+#endif // defined(WIDTH)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
new file mode 100644
index 000000000..3eb1a4ce7
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+ __global int *in_ind_buf, const int n)
+{
+ int gid = get_global_id(0);
+ int lws = get_local_size(0);
+ int groups = get_num_groups(0);
+ int gws = lws * groups;
+ int iter = n / gws;
+
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+ for (int i = 0; i < iter; ++i)
+ {
+ int idx = i * gws + gid;
+ in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
+ in_ind_buf[idx] = idx;
+ }
+}
+
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+ __global int *first_negative_idx, int n)
+{
+ int gid = get_global_id(0);
+
+ if (gid == n - 1)
+ {
+ // if the last item is positive, the first negative index is n.
+ if (out_key_buf[gid] > 0.f)
+ *first_negative_idx = n;
+ }
+ else if (gid == 0)
+ {
+    // if the first item is negative, the first negative index is 0.
+ if (out_key_buf[gid] < 0.f)
+ *first_negative_idx = 0;
+ }
+ else
+ {
+    // if the item to its left is positive and this item is negative, it is the first negative item.
+ if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
+ *first_negative_idx = gid;
+ }
+}
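+
+// Background (comment only): after the companion radix sort kernels order the keys by their raw
+// IEEE-754 bit patterns, positive values come out ascending followed by negative values in
+// descending value order (the set sign bit makes them compare as large unsigned integers).
+// This kernel locates that boundary so topkv2_reorder_negatives can move the negative block,
+// reversed, to the front and obtain a fully ascending sequence.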
+
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+                                        __global int *in_ind_buf, __global int *out_ind_buf,
+ __global int *first_negative_idx, int n)
+{
+ int gid = get_global_id(0);
+
+ int num_negs = n - *first_negative_idx;
+ int in_idx;
+
+ if (gid < num_negs)
+ {
+ in_idx = n - 1 - gid;
+ }
+ else
+ {
+ in_idx = gid - num_negs;
+ }
+
+ out_key_buf[gid] = in_key_buf[in_idx];
+ out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+ __global float *out_key_buf, __global int *out_ind_buf, int n)
+{
+ int gid = get_global_id(0);
+
+ Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+ Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+ int idx = n - 1 - gid;
+
+ *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+ *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..460de790b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+inline __global float *get_vec_elem(Vector *vec, int idx)
+{
+ return (__global float *)(vec->ptr + idx * vec->stride_x);
+}
+
+inline __global int *get_vec_elem_int(Vector *vec, int idx)
+{
+ return (__global int *)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+ float t = *a;
+ *a = *b;
+ *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+ int t = *a;
+ *a = *b;
+ *b = t;
+}
+
+/* This function is the same in both the iterative and recursive versions */
+int partition(Vector *arr, __global int *indices, int l, int h)
+{
+ float x = *get_vec_elem(arr, h);
+ int i = (l - 1);
+
+ for (int j = l; j <= h - 1; j++)
+ {
+ if (*get_vec_elem(arr, j) >= x)
+ {
+ i++;
+ swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
+ swap_idx(&indices[i], &indices[j]);
+ }
+ }
+ swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+ swap_idx(&indices[i + 1], &indices[h]);
+ return (i + 1);
+}
+
+/* A[] --> Array to be sorted,
+ l --> Starting index,
+ h --> Ending index */
+void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
+{
+ // Create an auxiliary stack
+
+ // initialize top of stack
+ int top = -1;
+
+ // push initial values of l and h to stack
+ stack[++top] = l;
+ stack[++top] = h;
+
+  // Keep popping from the stack while it is not empty
+ while (top >= 0)
+ {
+ // Pop h and l
+ h = stack[top--];
+ l = stack[top--];
+
+ // Set pivot element at its correct position
+ // in sorted array
+ int p = partition(arr, indices, l, h);
+
+ // If there are elements on left side of pivot,
+ // then push left side to stack
+ if (p - 1 > l)
+ {
+ stack[++top] = l;
+ stack[++top] = p - 1;
+ }
+
+ // If there are elements on right side of pivot,
+ // then push right side to stack
+ if (p + 1 < h)
+ {
+ stack[++top] = p + 1;
+ stack[++top] = h;
+ }
+ }
+}
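+
+// Note (comment only): partition() keeps elements >= the pivot on the left, so this iterative
+// quicksort orders the vector in descending order; topkv2_quicksort below then reads the top-k
+// values and their original indices directly from the first k slots.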
+
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
+ VECTOR_DECLARATION(topk_indices), __global int *indices,
+ __global int *temp_stack, int k, int n)
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+ Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+ Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+ for (int i = 0; i < n; ++i)
+ {
+ indices[i] = i;
+ }
+
+ quickSortIterative(&input, indices, temp_stack, 0, n - 1);
+
+ // extract k items.
+ for (int i = 0; i < k; ++i)
+ {
+ *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+ *get_vec_elem_int(&topk_indices, i) = indices[i];
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..e9d4696b4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+ const int pass, __local int *loc_histo, const int n)
+{
+ int it = get_local_id(0); // i local number of the processor
+ int ig = get_global_id(0); // global number = i + g I
+
+ int gr = get_group_id(0); // g group number
+
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ // set the local histograms to zero
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // range of keys that are analyzed by the work item
+ int size = n / groups / items; // size of the sub-list
+ int start = ig * size; // beginning of the sub-list
+
+ unsigned int key;
+ int shortkey, k;
+
+ // compute the index
+ // the computation depends on the transposition
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+
+ key = *((__global unsigned int *)(in_key_buf + k));
+
+ // extract the group of _BITS bits of the pass
+ // the result is in the range 0.._RADIX-1
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ // increment the local histogram
+ loc_histo[shortkey * items + it]++;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // copy the local histogram to the global one
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list for improving
+// coalescent memory access
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+ const int nbrow, const __global int *inperm, __global int *outperm,
+ __local int *blockmat, __local int *blockperm, const int tilesize)
+{
+
+ int i0 = get_global_id(0) * tilesize; // first row index
+ int j = get_global_id(1); // column index
+
+ int jloc = get_local_id(1); // local column index
+
+ // fill the cache
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int k = (i0 + iloc) * nbcol + j; // position in the matrix
+ blockmat[iloc * tilesize + jloc] = invect[k];
+#ifdef PERMUT
+ blockperm[iloc * tilesize + jloc] = inperm[k];
+#endif
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first row index in the transpose
+ int j0 = get_group_id(1) * tilesize;
+
+ // put the cache at the good place
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+ outvect[kt] = blockmat[jloc * tilesize + iloc];
+#ifdef PERMUT
+ outperm[kt] = blockperm[jloc * tilesize + iloc];
+#endif
+ }
+}
+
+// each virtual processor reorders its data using the scanned histogram
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+ __global int *d_Histograms, const int pass,
+ __global int *indices_in, __global int *indices_out,
+ __local int *loc_histo, const int n)
+{
+
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+
+ int gr = get_group_id(0);
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ int start = ig * (n / groups / items);
+ int size = n / groups / items;
+
+ // take the histogram in the cache
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int newpos, shortkey, k, newpost;
+ unsigned int key;
+
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+ float org_value = in_key[k];
+ key = *(__global unsigned int *)(in_key + k);
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ newpos = loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+ int ignew, jnew;
+ ignew = newpos / (n / groups / items);
+ jnew = newpos % (n / groups / items);
+ newpost = jnew * (groups * items) + ignew;
+#else
+ newpost = newpos;
+#endif
+
+ // d_outKeys[newpost]= key; // killing line !!!
+ out_key[newpost] = org_value;
+
+#ifdef PERMUT
+ indices_out[newpost] = indices_in[k];
+#endif
+
+ newpos++;
+ loc_histo[shortkey * items + it] = newpos;
+ }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+ __global int *globsum)
+{
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+ int decale = 1;
+ int n = get_local_size(0) * 2;
+ int gr = get_group_id(0);
+
+ // load input into local memory
+ // up sweep phase
+ temp[2 * it] = histo[2 * ig];
+ temp[2 * it + 1] = histo[2 * ig + 1];
+
+ // parallel prefix sum (algorithm of Blelloch 1990)
+ for (int d = n >> 1; d > 0; d >>= 1)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+ temp[bi] += temp[ai];
+ }
+ decale *= 2;
+ }
+
+ // store the last element in the global sum vector
+ // (maybe used in the next step for constructing the global scan)
+ // clear the last element
+ if (it == 0)
+ {
+ globsum[gr] = temp[n - 1];
+ temp[n - 1] = 0;
+ }
+
+ // down sweep phase
+ for (int d = 1; d < n; d *= 2)
+ {
+ decale >>= 1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write results to device memory
+
+ histo[2 * ig] = temp[2 * it];
+ histo[2 * ig + 1] = temp[2 * it + 1];
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
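+
+// Worked example (comment only): for a local block {3, 1, 7, 0} the up-sweep builds partial
+// sums and the down-sweep leaves the exclusive prefix scan {0, 3, 4, 11}; the block total 11
+// goes to globsum, and once the block totals are themselves scanned,
+// radixsort_pastehistograms adds each block's preceding total back into its entries.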
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
+{
+ int ig = get_global_id(0);
+ int gr = get_group_id(0);
+
+ int s;
+
+ s = globsum[gr];
+
+ // write results to device memory
+ histo[2 * ig] += s;
+ histo[2 * ig + 1] += s;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
new file mode 100644
index 000000000..047004d5e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int vector_size = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+ op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32,
+ DataType::S64);
+ }
+ if (prev_output != nullptr && prev_output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32,
+ DataType::S32, DataType::S64);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
+ }
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *prev_output,
+ ITensorInfo *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(axis, 1);
+ DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input),
+ Steps(vector_size));
+ bool window_changed = false;
+
+ switch (axis)
+ {
+ case 0:
+ {
+ ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
+ AccessWindowStatic input_access(input_tensor_access, 0, 0,
+ static_cast<int>(input_tensor_access->dimension(0)), 1);
+ AccessWindowHorizontal output_access(output, 0, 1);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ break;
+ case 1:
+ case 2:
+ case 3:
+ {
+ AccessWindowHorizontal input_access(input, 0, vector_size);
+ AccessWindowHorizontal output_access(output, 0, vector_size);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
+ : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
+ _op(ReductionOperation::ARG_IDX_MAX)
+{
+}
+
+void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output,
+ ICLTensor *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
+ output->info(), axis, op));
+ auto win_config = validate_and_configure_window(
+ input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
+ op);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ _input = input;
+ _prev_output = prev_output;
+ _output = output;
+ _reduction_axis = axis;
+ _op = op;
+
+ // Set build options
+ CLBuildOptions build_opts;
+
+ build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
+ build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
+ build_opts.add_option("-DDATA_TYPE_OUTPUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_SELECT=" +
+ get_cl_signed_type_from_element_size(input->info()->element_size()));
+
+ // Create kernel
+ cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+ std::string kernel_axis_name;
+ switch (axis)
+ {
+ case 0:
+ {
+ const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
+ build_opts.add_option("-DWIDTH=" +
+ support::cpp11::to_string(input_for_width->info()->dimension(0)));
+
+ kernel_axis_name = "x";
+ lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0),
+ vector_size);
+ }
+ break;
+ case 1:
+ build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ kernel_axis_name = "y";
+ break;
+ case 2:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ kernel_axis_name = "z";
+ break;
+ case 3:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+ kernel_axis_name = "w";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
+
+ // Configure kernel window
+ ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+}
+
+Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
+ output->clone().get(), axis, op)));
+ return Status{};
+}
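+
+// Illustrative usage sketch (comment only; the tensor objects are hypothetical and not part of
+// this file):
+//   CLArgMinMaxLayerKernelEx kernel;
+//   kernel.configure(&input, /*prev_output=*/nullptr, &indices, /*axis=*/0,
+//                    ReductionOperation::ARG_IDX_MAX);
+//   CLScheduler::get().enqueue(kernel);
+// prev_output is only non-null when a multi-stage reduction feeds the partial results of a
+// previous kernel into this one.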
+
+void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ switch (_reduction_axis)
+ {
+ case 0:
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Reshape window
+ const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
+
+ // Set local sums buffer
+ unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ if (_prev_output != nullptr)
+ {
+ add_2D_tensor_argument(idx, _prev_output, in_slice);
+ }
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
+ break;
+ case 1:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1),
+ _input->info()->dimension(1)));
+ Window in_slice = window_in.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_2D(in_slice) &&
+ window.slide_window_slice_2D(out_slice));
+ }
+ break;
+ case 2:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2),
+ _input->info()->dimension(2)));
+ Window in_slice = window_in.first_slice_window_3D();
+ Window out_slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_3D(in_slice) &&
+ window.slide_window_slice_3D(out_slice));
+ }
+ break;
+ case 3:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(3, Window::Dimension(0, 1, 1));
+ Window in_slice = window_in.first_slice_window_4D();
+ Window out_slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, in_slice);
+ add_4D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_4D(in_slice) &&
+ window.slide_window_slice_4D(out_slice));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..fbc76f5e1
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, BinaryLogicalOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "binary_logical_op";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+ int op_code = 0;
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ op_code = 1;
+ break;
+ case BinaryLogicalOperation::OR:
+ op_code = 2;
+ break;
+ default:
+      throw std::runtime_error("Operation not supported yet");
+ }
+
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
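+
+// Note (comment only): broadcast_if_dimension_le_one() above pins the execution window to a
+// single position along any operand dimension of size 1, so that dimension is implicitly
+// broadcast against the other input instead of requiring the caller to tile the smaller tensor.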
+
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
new file mode 100644
index 000000000..6e0bcde7f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "support/StringSupport.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
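+  // Boolean tensors are represented as U8 on the CL backend; the kernel converts them to the
+  // requested (different) output data type.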
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::S16, DataType::U16, DataType::U32,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(),
+ "Input and output data types must be different");
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Auto-initialize the output shape if not yet set (only the shape can be auto-configured;
+  // the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  // Get number of elements to process per iteration
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ const std::string kernel_name = "cast_bool";
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ // Collapse window
+ const Window &full_window = window();
+ Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+ ICLKernel::configure_internal(collapsed_window);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(output->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+ return Status{};
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..67aaf2db6
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
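+  // Collapse dimensions 2..4 of the 4D window so input and output are walked slice by slice,
+  // while the 1D lookup tensor is bound as a whole.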
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
new file mode 100644
index 000000000..3bfe3e407
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+
+inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
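+  // A negative axis counts from the last dimension, so wrap it into [0, num_dimensions) first.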
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices,
+ ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ std::unique_ptr<ITensorInfo> output_info = input->clone();
+ output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis));
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+CLGatherExKernel::CLGatherExKernel()
+ : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+{
+}
+
+void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices,
+ ICLTensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), indices->info(), output->info(), axis));
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input = input;
+ _output = output;
+ _indices = indices;
+ _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" +
+ support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDICES_DIM=" +
+ support::cpp11::to_string(indices->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ indices->clone().get(),
+ output->clone().get(), axis)
+ .first);
+ return Status{};
+}
+
+void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window_collapsed);
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..930e7c944
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+{
+ // DO NOTHING
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's shape was not set");
+
+ ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+ output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Make _lookup_indices tensor
+ _lookup_indices = support::cpp14::make_unique<CLTensor>();
+ _lookup_indices->allocator()->init(
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ _lookup_indices->allocator()->allocate();
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "hashtable_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const_cast<ICLTensor *>(_lookups)->map(queue);
+ const_cast<ICLTensor *>(_keys)->map(queue);
+ _hits->map(queue);
+ _lookup_indices->map(queue);
+
+ // Set values of hits
+ const int32_t *lookups_buf =
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+ uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+ int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
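+  // Resolve the lookups on the host: build a key -> row-index map, translate every lookup into
+  // the matching row index (or -1 on a miss) and record the hit flag. The CL kernel then only
+  // has to gather rows using these precomputed indices.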
+ std::map<int32_t, size_t> key_map;
+ const size_t keys_num = _keys->info()->dimension(0);
+ for (size_t key_index = 0; key_index < keys_num; key_index++)
+ {
+ key_map[keys_buf[key_index]] = key_index;
+ }
+
+ const size_t lookups_num = _lookups->info()->dimension(0);
+ for (size_t i = 0; i < lookups_num; ++i)
+ {
+ const auto lookup_value = lookups_buf[i];
+ const auto it = key_map.find(lookup_value);
+ if (it != key_map.end())
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= lookups_num)
+ ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+ hits_buf[i] = static_cast<uint8_t>(1);
+ }
+ else
+ {
+ lookup_indices_buf[i] = -1;
+ hits_buf[i] = static_cast<uint8_t>(0);
+ }
+ }
+
+ const_cast<ICLTensor *>(_lookups)->unmap(queue);
+ const_cast<ICLTensor *>(_keys)->unmap(queue);
+ _hits->unmap(queue);
+ _lookup_indices->unmap(queue);
+
+ Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, win);
+ add_4D_tensor_argument(idx, _output, win);
+ add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+ enqueue(queue, *this, win);
+ } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..61c14d271
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_UNUSED(gamma);
+ ARM_COMPUTE_UNUSED(beta);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must not be 0");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // We handle the planes manually
+ Window win = calculate_max_window(*input, Steps(1));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+ // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
+ _run_in_place(false)
+{
+}
+
+void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ _run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+ gamma ? gamma->info() : nullptr,
+ beta ? beta->info() : nullptr, epsilon));
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
+ build_opts.add_option_if(gamma, "-DGAMMA");
+ build_opts.add_option_if(beta, "-DBETA");
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+ build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *gamma,
+ const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ return Status{};
+}
+
+void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window collapsed_window = window.collapse(window, Window::DimZ);
+
+ // We will process the planes together
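+  // - NCHW: X and Y are fixed to a single step, so each work-item normalizes a whole HxW plane.
+  // - NHWC: Y is fixed instead and Z iterates over the batch dimension.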
+ if (_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ }
+ else
+ {
+ collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1));
+ }
+
+ Window vec_window;
+ vec_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_window);
+ if (!_run_in_place)
+ {
+ add_4D_tensor_argument(idx, _output, collapsed_window);
+ }
+ if (_gamma)
+ {
+ add_1D_tensor_argument(idx, _gamma, vec_window);
+ }
+ if (_beta)
+ {
+ add_1D_tensor_argument(idx, _beta, vec_window);
+ }
+
+ enqueue(queue, *this, collapsed_window, lws_hint());
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..6b27c9917
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ // Checks performed when output is configured
+ if ((output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
+
+ // CLMultiplyScaleFactorKernel doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_tuple(Status{}, win);
+}
+} // namespace
+
+CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel()
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
+ ICLTensor *output, float multiplier)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+ _input = input;
+ _scale_factor = scale_factor;
+ _output = output;
+ _multiplier = multiplier;
+
+ const int vec_size_x = 16 / output->info()->element_size();
+ const int output_width_x = output->info()->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+ // Create and update the window (if needed)
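+  // Each work-item handles 16 bytes where possible: the window is rounded up to a multiple of
+  // the vector size and -DLAST_ACCESSED_X lets the kernel clamp the final (partial) access.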
+ Window win = calculate_max_window(*output->info());
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
+ vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option_if(
+ multi_access_x, "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
+}
+
+Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ return Status{};
+}
+
+void CLMultiplyScaleFactorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_2D();
+
+ // Set scale_factor window
+ Window win_scale = calculate_max_window(*_scale_factor->info(), Steps());
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _scale_factor, win_scale);
+ add_2D_tensor_argument(idx, _output, slice);
+ _kernel.setArg<float>(idx++, _multiplier);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_2D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..643c8b110
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
new file mode 100644
index 000000000..35d70d689
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "support/StringSupport.h"
+#include <string>
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+ indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
+ const ITensorInfo *on_value,
+ ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+ indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+ auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+CLOneHotKernel::CLOneHotKernel()
+ : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+ _is_off_value_memset(false)
+{
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+ _is_off_value_memset = false;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
+ ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ _off_value = off_value;
+ configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ ICLTensor *output, int depth, int axis)
+{
+ _is_off_value_memset = true;
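+  // Without an off_value tensor the output is expected to be pre-filled with the off value
+  // (e.g. via a memset), so the kernel only writes on_value and iterates over the indices
+  // window rather than the full output window.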
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+ configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
+ ICLTensor *output, int depth, int axis)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ if (_is_off_value_memset)
+ {
+    // Replace the window with one calculated from the indices info
+ win_config.second = calculate_max_window(*indices->info(), Steps());
+ }
+ _indices = indices;
+ _on_value = on_value;
+ _output = output;
+ const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
+ data_size_from_type(on_value->info()->data_type())));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" +
+ support::cpp11::to_string(output->info()->dimension(2)));
+ // Create kernel
+ const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
+ ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+ on_value->clone().get(),
+ output->clone().get(), depth, axis)
+ .first);
+ return Status{};
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+ on_value->clone().get(),
+ output->clone().get(), depth, axis)
+ .first);
+ return Status{};
+}
+void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
+ add_1D_tensor_argument(idx, _on_value, window_collapsed);
+ if (!_is_off_value_memset)
+ {
+ add_1D_tensor_argument(idx, _off_value, window_collapsed);
+ }
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..1a7a18cfa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, scale_factor);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ // Output must always be initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ const int vec_size_x = 16 / input->element_size();
+ const int input_width_x = input->tensor_shape().x();
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+ if (multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
+ vec_size_x));
+ }
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
+{
+}
+
+void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
+ ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+ _input = input;
+ _scale_factor = scale_factor;
+ _output = output;
+
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option_if(
+ multi_access_x, "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
+}
+
+Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLQuantizationSymmetricKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Support only 2D
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_2D();
+
+ do
+ {
+ Window scale_slice = slice.shift_dimensions(1);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _scale_factor, scale_slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
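
For reference, the host-side math behind the "quantization_symm8" program this kernel launches can be sketched as follows. This is a minimal illustration only: the scale factor holds one value per row of the 2D input (as enforced by validate_arguments above), while the exact rounding mode and clamp range are assumptions, since the OpenCL source is not part of this file.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Per-row symmetric quantization reference: q = round(x / scale), clamped to [-127, 127]
// (rounding mode and clamp range are assumptions for illustration).
std::vector<int8_t> quantize_symm8_ref(const std::vector<float> &input, std::size_t width,
                                       const std::vector<float> &scale_factor)
{
  std::vector<int8_t> output(input.size());
  const std::size_t rows = input.size() / width;
  for (std::size_t r = 0; r < rows; ++r)
  {
    const float scale = scale_factor[r]; // one scale per row, see validate_arguments
    for (std::size_t c = 0; c < width; ++c)
    {
      const float q = (scale != 0.f) ? std::round(input[r * width + c] / scale) : 0.f;
      output[r * width + c] = static_cast<int8_t>(std::min(127.f, std::max(-127.f, q)));
    }
  }
  return output;
}
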
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..3fbebf25a
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32, DataType::S32);
+ if (op == ReductionOperation::SUM)
+ {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "QASYMM8 is not supported yet");
+ }
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape must be initialized");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output's total size does not match the shape reduced over axis");
+
+ return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct kernel name
+ std::string kernel_name;
+ int op_code = 0;
+ if (op == ReductionOperation::MAX)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 1;
+ }
+ else if (op == ReductionOperation::MIN)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 2;
+ }
+ else if (op == ReductionOperation::SUM)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 3;
+ }
+ else if (op == ReductionOperation::MEAN_SUM)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 4;
+ }
+ else
+    throw std::runtime_error("Operation not supported yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ // Support dimensions up to 4
+ Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Save the output's shape so it can be restored at the end of this method
+  // TODO Remove this save/restore once it is guaranteed that the axis positions of input and
+  // output are the same
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
+
+  // Restore the output tensor's original shape
+ _output->info()->set_tensor_shape(shape_out);
+}
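
The run() method above temporarily swaps the output's shape for the inferred reduced shape and restores it before returning. The inference itself is simple, as this standalone sketch shows: the reduced axis collapses to 1 and every other dimension is preserved.

#include <array>
#include <cstdint>

// Mirrors inferOutputShape above on plain dimensions.
std::array<uint32_t, 4> infer_reduced_shape(std::array<uint32_t, 4> shape, uint32_t axis)
{
  shape[axis] = 1; // e.g. {2, 3, 4, 5} reduced over axis 2 becomes {2, 3, 1, 5}
  return shape;
}
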
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
new file mode 100644
index 000000000..8d8853c81
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
+
+#include <climits>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ TensorShape output_shape = TensorShape{input->dimension(1)};
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ TensorShape output_shape = TensorShape{input->dimension(1)};
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, output_shape, 1, input->data_type());
+
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0), 1);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+CLScaleFactorSymm8Kernel::CLScaleFactorSymm8Kernel() : _input(nullptr), _output(nullptr) {}
+
+void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
+
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+
+ return Status{};
+}
+
+void CLScaleFactorSymm8Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_2D();
+ slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window output_slice = slice.shift_dimensions(1);
+
+ unsigned int idx = 0;
+ // Set inputs
+ add_2D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _output, output_slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (window_collapsed.slide_window_slice_2D(slice));
+}
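
A host-side sketch of the per-row scale factor this kernel produces, assuming the usual max-abs / 127 symmetric int8 scheme; the authoritative formula lives in the "scale_factor_symm8" OpenCL program, which is not shown in this file.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// One scale factor per row of a rows x width float matrix (the 127 divisor is an assumption).
std::vector<float> scale_factor_symm8_ref(const std::vector<float> &input, std::size_t width)
{
  const std::size_t rows = input.size() / width;
  std::vector<float> scale(rows, 0.f);
  for (std::size_t r = 0; r < rows; ++r)
  {
    float max_abs = 0.f;
    for (std::size_t c = 0; c < width; ++c)
      max_abs = std::max(max_abs, std::fabs(input[r * width + c]));
    scale[r] = max_abs / 127.f;
  }
  return scale;
}
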
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..151d45e8d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+// GPU implementation disabled: it currently produces invalid results
+// TODO Enable the GPU implementation after verification, or remove this code
+#if 0
+namespace arm_compute
+{
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
+
+void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+ _topk_values = topk_values;
+ _topk_indices = topk_indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
+
+ unsigned int idx = 3 * num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *indices);
+ _kernel.setArg(idx++, *temp_stack);
+ _kernel.setArg<cl_int>(idx++, k);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, 1, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+ add_1D_tensor_argument(idx, _topk_values, window);
+ add_1D_tensor_argument(idx, _topk_indices, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
+
+void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
+ int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
+
+ unsigned int idx = num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *in_key_buf);
+ _kernel.setArg(idx++, *in_ind_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This kernel builds a radix histogram for each work item.
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
+
+void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
+
+ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 3;
+ _kernel.setArg(idx++, loc_histo_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg<cl_int>(2, _pass);
+
+ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
+
+void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
+
+void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
+ int bits)
+{
+ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *glob_sum_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *temp_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
+
+void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortReorder::CLRadixSortReorder()
+ : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
+ _out_ind_buf(nullptr)
+{
+}
+
+void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
+
+ unsigned int idx = 2;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 6;
+ _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
+ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg(1, *_out_key_buf);
+ _kernel.setArg<cl_int>(3, _pass);
+ _kernel.setArg(4, *_in_ind_buf);
+ _kernel.setArg(5, *_out_ind_buf);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
+
+void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_out_key_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
+ : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
+
+ unsigned int idx = 4;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_in_key_buf);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_in_ind_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Store::CLTopKV2Store()
+ : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(k == 0);
+ ARM_COMPUTE_ERROR_ON(k > n);
+
+ _values = values;
+ _indices = indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
+
+ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, k, 1));
+ ICLKernel::configure_internal(win);
+}
+
+void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
+{
+ _out_key_buf = out_key_buf;
+ _out_ind_buf = out_ind_buf;
+}
+
+void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _values, window);
+ add_1D_tensor_argument(idx, _indices, window);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+} // namespace arm_compute
+#endif // Disable GPU implementation
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
new file mode 100644
index 000000000..dfe5d59b0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+
+#include <algorithm>
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+namespace
+{
+
+using namespace arm_compute;
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op_templ(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
+ OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
+ OutputScalarType *))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ if (is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
+ non_broadcast_input_ptr, broadcast_value,
+ output_ptr, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) =
+ (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+ !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr =
+ reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr =
+ reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
+ input1_ptr, input2_ptr, output_ptr);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+} // namespace
+
+namespace arm_compute
+{
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ float (*scalar_func)(const float &, const float &),
+ int (*broadcast_func)(int, int, int, const float *, const float &, float *,
+ const bool),
+ int (*neon_func)(int, int, int, const float *, const float *, float *))
+{
+ elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func,
+ broadcast_func, neon_func);
+}
+
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
+ int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
+ uint8_t *, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *))
+{
+ elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func,
+ broadcast_func, neon_func);
+}
+} // namespace arm_compute
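
elementwise_op_templ wires three callbacks together: a scalar function for the leftover tail, a broadcast loop and a plain loop, where each loop returns the x index it stopped at so the caller can finish with the scalar path. The same contract, stripped of NEON and broadcasting, looks like this sketch (an illustration only, with a made-up add operation):

static float add_scalar(const float &a, const float &b) { return a + b; }

// Processes whole steps of window_step_x elements and reports where it stopped.
static int add_loop(int window_start_x, int window_end_x, int window_step_x, const float *in1,
                    const float *in2, float *out)
{
  int x = window_start_x;
  for (; x <= (window_end_x - window_step_x); x += window_step_x)
    for (int i = 0; i < window_step_x; ++i)
      out[x + i] = add_scalar(in1[x + i], in2[x + i]);
  return x;
}

static void run_elementwise(const float *in1, const float *in2, float *out, int n, int step)
{
  int x = add_loop(0, n, step, in1, in2, out);
  for (; x < n; ++x) // scalar tail, exactly as in elementwise_op_templ
    out[x] = add_scalar(in1[x], in2[x]);
}
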
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
new file mode 100644
index 000000000..32d7d6237
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation op, typename ScalarType>
+inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = a & b;
+ break;
+ case BinaryLogicalOperation::OR:
+ res = a | b;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op, typename VectorType>
+inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = {0, 0, 0, 0};
+
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ res = wrapper::vand(a, b);
+ break;
+ case BinaryLogicalOperation::OR:
+ res = wrapper::vorr(a, b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <BinaryLogicalOperation op>
+inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b)
+{
+ uint8x16x4_t out = {{
+ elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]),
+ elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]),
+ }};
+ return out;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_logic_op_broadcast(const VectorType &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x,
+ elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
+void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
+ const Window &window)
+{
+ elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>,
+ &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_logic_op_loop<op, ScalarType, VectorType>);
+}
+
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
+ const ITensor *input1, const ITensor *input2, ITensor *output,
+ std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if (it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor *input1, const ITensor *input2, ITensor *output,
+ const Window &window) { func(input1, input2, output, window); };
+ }
+ return nullptr;
+}
+
+template <BinaryLogicalOperation op>
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
+ {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
+ {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
+
+ return configure_func(input1, input2, output, map_function);
+}
+
+void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1,
+ const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output);
+ break;
+ case BinaryLogicalOperation::OR:
+ _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output)
+{
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8,
+ DataType::QASYMM8);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+ const TensorShape out_shape =
+ TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if (output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
+
+} // namespace arm_compute
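
On U8 data the scalar path of this kernel reduces to plain bitwise AND/OR, which coincides with logical AND/OR when booleans are stored as 0 and 1:

#include <cstdint>

// Scalar semantics of the U8 logical ops (bitwise AND/OR), shown standalone.
inline uint8_t logic_and_u8(uint8_t a, uint8_t b) { return static_cast<uint8_t>(a & b); }
inline uint8_t logic_or_u8(uint8_t a, uint8_t b) { return static_cast<uint8_t>(a | b); }
// e.g. logic_and_u8(1, 0) == 0, logic_or_u8(1, 0) == 1
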
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
new file mode 100644
index 000000000..12017e543
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/SaturateCast.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::S16, DataType::U16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {}
+
+void NECastBoolKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Auto-initialize the output shape if it is not initialized yet (only the shape can be
+  // auto-configured; the data type must already be set)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ _input = input;
+ _output = output;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICPPKernel::configure(win);
+}
+
+Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
+void NECastBoolKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
+ ARM_COMPUTE_ERROR_ON(_input == _output);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const uint8_t true_val = 1;
+ const uint8x8_t mask_bool = vdup_n_u8(true_val);
+
+ switch (_output->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ /* Conversion U8 -> S8 */
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8(
+ texels_u8, vdupq_n_u8(true_val))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::S16:
+ {
+ /* Up-conversion U8 -> S16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+ vst1q_s16(output_ptr + x, texels.val[0]);
+ vst1q_s16(output_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+                *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion U8 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+ vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+                *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion U8 -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+ vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(output_ptr + x + 12,
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val);
+ *(output_ptr + x) = static_cast<float>(in);
+ }
+ },
+ input, output);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ /* Up-conversion U8 -> F16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+ vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::U8:
+ {
+      /* Conversion U8 -> U8 */
+ execute_window_loop(win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val)));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::U16:
+ {
+ /* Up-conversion U8 -> U16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)),
+ vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}};
+
+ vst1q_u16(output_ptr + x, texels.val[0]);
+ vst1q_u16(output_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+}
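
For reference, every branch of the switch above implements the same element-wise rule: mask the boolean byte with 1 and widen it to the destination type. A minimal standalone sketch of that rule follows (plain C++; the helper name cast_bool_scalar and the use of std::vector are illustrative, not part of the imported sources).

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference of the kernel's tail loops: out[i] = static_cast<T>(in[i] & 1)
template <typename T> std::vector<T> cast_bool_scalar(const std::vector<uint8_t> &in)
{
  std::vector<T> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
  {
    out[i] = static_cast<T>(in[i] & 1); // any non-zero "true" byte becomes exactly 1
  }
  return out;
}

// Example: cast_bool_scalar<int32_t>({0, 1, 255, 7}) yields {0, 1, 1, 1}.
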
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..091d38c56
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
+ : _input(nullptr), _lookups(nullptr), _output(nullptr)
+{
+}
+
+void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output,
+ const ITensor *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Auto initialize output if not initialized
+ auto out_shape = input->info()->tensor_shape();
+  out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0));
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ const arm_compute::ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ return Status{};
+}
+
+void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const int32_t lookup = *reinterpret_cast<int32_t *>(
+ _lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
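
The run() above copies one row of the input table per lookup index. The same behaviour, reduced to a standalone scalar sketch (the helper name embedding_lookup_ref and the float, row-major layout are illustrative assumptions):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Scalar reference: for every lookup index, copy the matching row of the embedding table.
std::vector<float> embedding_lookup_ref(const std::vector<float> &table, std::size_t row_size,
                                        const std::vector<int32_t> &lookups)
{
  std::vector<float> out(lookups.size() * row_size);
  for (std::size_t i = 0; i < lookups.size(); ++i)
  {
    std::memcpy(out.data() + i * row_size, table.data() + lookups[i] * row_size,
                row_size * sizeof(float));
  }
  return out;
}
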
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
new file mode 100644
index 000000000..93963a504
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in] indices Indices tensor.
+ */
+template <typename U> void validate_indices(const ITensor *indices)
+{
+ for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+ }
+}
+
+} // namespace
+
+NEGatherKernelEx::NEGatherKernelEx()
+ : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{}
+{
+}
+
+template <typename U>
+inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Iterator output_it(_output, window);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices_rank);
+
+ U new_index;
+ switch (_indices_rank)
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+ break;
+ case 2:
+ new_index =
+ *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
+ break;
+ case 3:
+ new_index = *(
+ reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(0, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+template <typename U>
+void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output_it(_output, output_window);
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices_rank, _axis);
+
+ U new_index;
+ switch (_indices_rank)
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+ break;
+ case 2:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
+ break;
+ case 3:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(_axis, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id),
+ _input->info()->dimension(0) * _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
+}
+
+void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+ _indices_rank = indices->info()->num_dimensions();
+
+ if (_axis < 0)
+ {
+ _axis += input->info()->num_dimensions();
+ }
+ ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+ if (0 == _axis)
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernelEx::gather_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernelEx::gather_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+
+ if (axis < 0)
+ {
+ axis += input->num_dimensions();
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+ input->tensor_shape(), indices->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
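
As a reference for the axis-0 path with rank-1 indices, the gather selects positions along the innermost (contiguous) dimension. A standalone sketch under that assumption follows (the name gather_axis0_ref and the 2-D float layout are illustrative, not part of the imported sources):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference of the axis-0 path with rank-1 indices:
// out[x, y] = in[indices[x], y], with the x dimension stored contiguously.
std::vector<float> gather_axis0_ref(const std::vector<float> &in, std::size_t in_w, std::size_t h,
                                    const std::vector<int32_t> &indices)
{
  const std::size_t out_w = indices.size();
  std::vector<float> out(out_w * h);
  for (std::size_t y = 0; y < h; ++y)
  {
    for (std::size_t x = 0; x < out_w; ++x)
    {
      out[y * out_w + x] = in[y * in_w + static_cast<std::size_t>(indices[x])];
    }
  }
  return out;
}
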
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
new file mode 100644
index 000000000..30787c0a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <unordered_map>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr size_t NOT_HIT = 0xFFFFFFFF;
+} // namespace
+
+NEHashtableLookupKernel::NEHashtableLookupKernel()
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
+{
+}
+
+void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys,
+ const ITensor *input, ITensor *output, ITensor *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Auto initialize output if not initialized
+ auto out_shape{input->info()->tensor_shape()};
+  out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->dimension(0), false);
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ // Auto initialize hits if not initialized
+ auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8);
+
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1));
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ }
+
+ // Validate in case of configured hits
+ if (hits->total_size() > 0)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1));
+ ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0));
+ ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+
+void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+ const int const_0 = _output->info()->data_type() == DataType::QASYMM8
+ ? _output->info()->quantization_info().uniform().offset
+ : 0;
+
+ std::unordered_map<int32_t, size_t> key_index_map;
+ for (size_t n = 0; n < _keys->info()->dimension(0); ++n)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n}));
+ key_index_map[key] = n;
+ }
+ std::vector<size_t> lookup_indices;
+ for (size_t k = 0; k < _lookups->info()->dimension(0); ++k)
+ {
+ const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k}));
+ const auto it = key_index_map.find(key);
+ if (it == key_index_map.end())
+ {
+ lookup_indices.emplace_back(NOT_HIT);
+ *_hits->ptr_to_element({k}) = 0;
+ }
+ else
+ {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+ if (it->second >= _keys->info()->dimension(0))
+ ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+ lookup_indices.emplace_back(it->second);
+ *_hits->ptr_to_element({k}) = 1;
+ }
+ }
+
+ Window output_window{window};
+ output_window.set(Window::DimX,
+ Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
+
+ Window out_slice = output_window.first_slice_window_4D();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice,
+ [&](const Coordinates &id) {
+ const auto lookup = lookup_indices.at(id[lookup_dim]);
+ if (lookup == NOT_HIT)
+ {
+ memset(output_it.ptr(), const_0,
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ else
+ {
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ },
+ output_it);
+
+ } while (window.slide_window_slice_4D(out_slice));
+}
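
The run() above first builds a key-to-row map and then, for each lookup, either copies the matching row or zero-fills it and clears the hit flag. A standalone scalar sketch of that behaviour (the names hashtable_lookup_ref and row_size are illustrative; the real kernel writes the quantization offset instead of 0 for QASYMM8 outputs):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <vector>

// Scalar reference: map each lookup key to a row of the table; rows for missing keys
// stay zero-filled and their hit flag stays 0.
void hashtable_lookup_ref(const std::vector<int32_t> &lookups, const std::vector<int32_t> &keys,
                          const std::vector<float> &table, std::size_t row_size,
                          std::vector<float> &out, std::vector<uint8_t> &hits)
{
  std::unordered_map<int32_t, std::size_t> key_to_row;
  for (std::size_t n = 0; n < keys.size(); ++n)
  {
    key_to_row[keys[n]] = n;
  }

  out.assign(lookups.size() * row_size, 0.0f);
  hits.assign(lookups.size(), 0);
  for (std::size_t k = 0; k < lookups.size(); ++k)
  {
    const auto it = key_to_row.find(lookups[k]);
    if (it == key_to_row.end())
    {
      continue; // miss: row stays zero, hit flag stays 0
    }
    std::memcpy(out.data() + k * row_size, table.data() + it->second * row_size,
                row_size * sizeof(float));
    hits[k] = 1;
  }
}
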
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..49adf1462
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon, const Window &window)
+{
+ /** NEON vector tag type. */
+ using ExactTagType =
+ typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Clear X/Y dimensions on execution window as we handle the planes manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+ const auto channel_idx =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+ Iterator input_it(input, win);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id) {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<T>(0.f);
+ auto sum_squares_h_w = static_cast<T>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
+ vec_sum_squares_h_w =
+ wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+ }
+
+ auto vec2_sum_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
+ wrapper::vgetlow(vec_sum_squares_h_w));
+ for (int i = 0; i < window_step_x / 4; ++i)
+ {
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+ }
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = *(input_ptr + x);
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ auto gamma_val = 1.0f;
+ if (gamma != nullptr)
+ {
+ gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
+ }
+ const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
+ auto beta_val = 0.0f;
+ if (beta != nullptr)
+ {
+ beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
+ }
+ const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ vec_val = wrapper::vloadq(input_ptr + x);
+ vec_val = wrapper::vadd(
+ wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
+ wrapper::vstore(output_ptr + x, vec_val);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
+
+ if (output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
+ }
+
+ if (gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ gamma->dimension(0),
+ "Gamma's size must be the same as size of input's channel");
+ }
+
+ if (beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ beta->dimension(0),
+ "Beta's size must be the same as size of input's channel");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // We handle the planes manually
+ Window win = calculate_max_window(*input, Steps(1));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+ // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be
+ // skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
+ _epsilon(1e-12)
+{
+}
+
+void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output,
+ ITensor *gamma, ITensor *beta, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _input = input;
+ _output = output == nullptr ? input : output;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+      _input->info(), _output->info(), gamma != nullptr ? gamma->info() : nullptr,
+      beta != nullptr ? beta->info() : nullptr, epsilon));
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ _func = &instance_normalization_nchw<float>;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if (_input->info()->data_type() == DataType::F16)
+ {
+ _func = &instance_normalization_nchw<float16_t>;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *gamma,
+ const ITensorInfo *beta, float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ return Status{};
+}
+
+void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
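
Per (batch, channel) plane, the kernel above computes the plane mean and variance and then applies y = (x - mean) * gamma / sqrt(var + epsilon) + beta. A scalar sketch of one plane follows (the helper name instance_norm_plane_ref is illustrative, not part of the imported sources):

#include <cmath>
#include <cstddef>

// Scalar reference for one H*W plane of one (batch, channel) pair:
// out[i] = (in[i] - mean) * gamma / sqrt(var + epsilon) + beta
void instance_norm_plane_ref(const float *in, float *out, std::size_t hw, float gamma, float beta,
                             float epsilon)
{
  float sum = 0.f, sum_sq = 0.f;
  for (std::size_t i = 0; i < hw; ++i)
  {
    sum += in[i];
    sum_sq += in[i] * in[i];
  }
  const float mean = sum / hw;
  const float var = sum_sq / hw - mean * mean;
  const float multip = gamma / std::sqrt(var + epsilon);
  for (std::size_t i = 0; i < hw; ++i)
  {
    out[i] = (in[i] - mean) * multip + beta;
  }
}
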
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..b92130cec
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ // Checks performed when output is configured
+ if ((output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+inline int32x4x4_t load_value(const int32_t *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+ wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
+{
+ const float32x4_t vscale = vdupq_n_f32(scale);
+
+ const float32x4x4_t ret = {{
+ vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
+ }};
+ return ret;
+}
+} // namespace
+
+NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+ ITensor *output, float multiplier)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+ _input = input;
+ _scale_factor = scale_factor;
+ _output = output;
+ _multiplier = multiplier;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *scale_factor,
+ const ITensorInfo *output, float multiplier)
+{
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+ return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+  // Only 2D input is supported
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
+ auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+ scale *= _multiplier;
+
+ const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+          store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = input_ptr[x] * scale;
+ }
+ },
+ input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ NEMultiplyScaleFactorKernel::multiply<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
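
The kernel dequantizes a 2-D S32 tensor row by row: each row is scaled by its own scale factor times the global multiplier. A standalone scalar sketch (the name multiply_scale_factor_ref and the row-major float output are illustrative assumptions):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference: out[r][c] = in[r][c] * scale[r] * multiplier
std::vector<float> multiply_scale_factor_ref(const std::vector<int32_t> &in, std::size_t rows,
                                             std::size_t cols, const std::vector<float> &scale,
                                             float multiplier)
{
  std::vector<float> out(in.size());
  for (std::size_t r = 0; r < rows; ++r)
  {
    const float s = scale[r] * multiplier;
    for (std::size_t c = 0; c < cols; ++c)
    {
      out[r * cols + c] = static_cast<float>(in[r * cols + c]) * s;
    }
  }
  return out;
}
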
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
new file mode 100644
index 000000000..0a11eb509
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+namespace arm_compute
+{
+namespace
+{
+/** Validate the depth
+ *
+ * Validate that the depth is not negative and matches the output dimension along the given axis
+ *
+ * @param[in] depth Depth tensor.
+ * @param[in] output Output tensor.
+ * @param[in] axis Axis of depth.
+ */
+template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0);
+ ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) !=
+ *(reinterpret_cast<U *>(depth->buffer())));
+}
+
+Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+ const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis ||
+ actual_axis >= static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+ }
+
+ return Status{};
+}
+
+template <typename U, typename Enable = void> bool isOnValue(U) { return true; }
+
+template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0>
+bool isOnValue(U index, U depth)
+{
+ return index >= 0 && index < depth;
+}
+} // namespace
+
+NEOneHotKernel::NEOneHotKernel()
+ : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1},
+ _output{nullptr}, _func{}
+{
+}
+
+template <typename U>
+void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+  // Validate that the depth is not negative
+ validate_depth<U>(_depth, _output, _axis);
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator output_it(_output, output_window);
+  execute_window_loop(
+      output_window,
+      [&](const Coordinates &id) {
+        // Fill the whole output row with off_value, element by element
+        for (size_t i = 0; i < _output->info()->dimension(0); ++i)
+        {
+          std::copy_n(_off_value->buffer(), _output->info()->element_size(),
+                      output_it.ptr() + i * _output->info()->element_size());
+        }
+ Coordinates indices_id(id);
+ indices_id.remove(0);
+ const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+ if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+ {
+ Coordinates onehot_id(id);
+ onehot_id.set(0, new_index);
+ std::copy_n(_on_value->buffer(), _output->info()->element_size(),
+ _output->ptr_to_element(onehot_id));
+ }
+ },
+ output_it);
+}
+
+template <typename U>
+inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+  // Validate that the depth is not negative
+ validate_depth<U>(_depth, _output, _axis);
+ Iterator output_it(_output, window);
+ execute_window_loop(window,
+ [&](const Coordinates &id) {
+ Coordinates indices_id(id);
+ indices_id.remove(_axis);
+ const U new_index =
+ *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+ if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+ {
+ Coordinates onehot_id(id);
+ onehot_id.set(_axis, new_index);
+ std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer()
+ : _off_value->buffer(),
+ _output->info()->element_size(), output_it.ptr());
+ }
+ },
+ output_it);
+}
+
+void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth,
+ const ITensor *on_value, const ITensor *off_value, ITensor *output,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(),
+ off_value->info(), output->info(), axis));
+ _indices = indices;
+ _depth = depth;
+ _on_value = on_value;
+ _off_value = off_value;
+ _output = output;
+ _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+ if (0 == _axis)
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEOneHotKernel::onehot_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEOneHotKernel::onehot_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEOneHotKernel::onehot_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEOneHotKernel::onehot_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
+
+Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(indices, depth, on_value, off_value, output, axis));
+ return Status{};
+}
+
+void NEOneHotKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ (this->*_func)(window, info);
+}
+} // namespace arm_compute
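
For the axis-0 path, each index produces a contiguous run of `depth` elements set to off_value except for the position named by the index, which receives on_value; out-of-range indices leave the run all-off. A standalone scalar sketch (the name one_hot_axis0_ref and the float values are illustrative, not part of the imported sources):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference of the axis-0 path: one length-`depth` run per index.
std::vector<float> one_hot_axis0_ref(const std::vector<int32_t> &indices, int32_t depth,
                                     float on_value, float off_value)
{
  const std::size_t d = static_cast<std::size_t>(depth);
  std::vector<float> out(indices.size() * d, off_value);
  for (std::size_t i = 0; i < indices.size(); ++i)
  {
    const int32_t idx = indices[i];
    if (idx >= 0 && idx < depth)
    {
      out[i * d + static_cast<std::size_t>(idx)] = on_value;
    }
  }
  return out;
}
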
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..5841f1d69
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+ return Status{};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+inline float32x4_t round(const float32x4_t &fv)
+{
+ const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f);
+ const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
+ // If value < 0, mask = -1, else mask = 0
+ int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4));
+ return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4));
+}
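+ // Note (illustrative): the +/-0.5 bias towards the sign of the value makes the truncating
+ // vcvtq_s32_f32 conversion on the non-aarch64 path round half away from zero, e.g.
+ //   round(2.3f)  -> 2.8f  -> 2,   round(-2.7f) -> -3.2f -> -3,   round(2.5f) -> 3.0f -> 3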
+
+inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale)
+{
+ const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv);
+ const int32x4_t vposend = vdupq_n_s32(max_scale);
+ const int32x4_t vnagend = vdupq_n_s32(-max_scale);
+
+ const int32x4x4_t rf = {{
+#ifdef __aarch64__
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend,
+ vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#else //__aarch64__
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#endif //__aarch64__
+ }};
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ return vcombine_s8(pa, pb);
+}
+} // namespace
+
+NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
+ : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
+{
+}
+
+void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output,
+ ITensor *scale_factor)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), scale_factor->info()));
+
+ _input = input;
+ _output = output;
+ _scale_factor = scale_factor;
+
+ // Configure kernel window
+ Window win_config = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win_config);
+}
+
+Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *scale_factor)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor));
+
+ return Status{};
+}
+
+template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
+#endif //__aarch64__
+
+ // Reset the first (x) dimension of the window; tail elements along x are handled manually below
+ // Only 2D input is supported
+ Window win_collapsed = window;
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ const auto dim_x = _input->info()->dimension(0);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &id) {
+ const auto start = reinterpret_cast<const T *>(input.ptr());
+ const auto min_max = std::minmax_element(start, start + dim_x);
+ const auto int8_scale = 127;
+ auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
+ if (range == 0)
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
+ range = 1;
+ }
+ else
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
+ }
+ const auto scale_factor_inv = int8_scale / range;
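+ // Worked example (illustrative values): for a row [-0.8f, 0.2f, 0.4f], range = 0.8f, the
+ // stored scale factor is 0.8f / 127 and scale_factor_inv = 127 / 0.8f, so the row quantizes
+ // to [-127, 32, 64]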
+
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x],
+ vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
+ quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
+ output_ptr[x] = static_cast<int8_t>(quantized);
+ }
+ },
+ input, output);
+}
+
+void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ NEQuantizationSymmetricKernel::quantize<float>(window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEQuantizationSymmetricKernel::quantize<float16_t>(window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..863316909
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+const std::pair<unsigned int, unsigned int>
+arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom)
+{
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ const unsigned int padx = info.pad_left() + info.pad_right();
+ const unsigned int pady = info.pad_top() + info.pad_bottom();
+
+ ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
+ ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
+ ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
+
+ // Find the transpose conv out dimensions
+ // transpose conv out:
+ // tconv_out + pad = (in - 1) * stride + kernel + invalid
+ // tconv_out = (in - 1) * stride + kernel + invalid - pad
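+ // e.g. (illustrative numbers) in = 4, stride = 2, kernel = 3, pad = 0, invalid = 0
+ // gives tconv_out = 2 * (4 - 1) + 3 = 9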
+ const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
+ const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
+
+ return std::make_pair<unsigned int, unsigned int>(w, h);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty file aims to validate "CLFunctionsEx.h".
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
new file mode 100644
index 000000000..267228eac
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Utils.h"
+
+namespace arm_compute
+{
+CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
+ _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
+{
+}
+
+Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+ const ReductionOperation &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+ op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid reduction operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+ const unsigned int num_of_stages =
+ calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+
+ DataType output_data_type = DataType::S32;
+ TensorInfo not_reshaped_output;
+ const auto input_num_channels = input->num_channels();
+ const auto input_qinfo = input->quantization_info();
+
+ if (output->total_size() != 0)
+ {
+ output_data_type = output->data_type();
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis,
+ false));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+ }
+
+ auto shape_before_reshape = input->tensor_shape();
+ shape_before_reshape.set(axis, 1);
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
+ int num_channels, QuantizationInfo qinfo) {
+ ti.set_data_type(data_type)
+ .set_tensor_shape(shape)
+ .set_num_channels(num_channels)
+ .set_quantization_info(qinfo);
+ };
+
+ initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
+ input_num_channels, input_qinfo);
+
+ if (num_of_stages == 1)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
+ }
+ else
+ {
+ // Create temporary tensor infos
+ std::vector<TensorInfo> sums_vector(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ }
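+ // Note (illustrative sizes): each intermediate stage shrinks the x dimension by a factor of
+ // up to 128, e.g. a hypothetical x = 16384 becomes 128 after the first partial reduction,
+ // and the last stage then produces the fully reduced result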
+
+ // Validate ReductionOperation only on first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
+
+ // Validate ReductionOperation on intermediate stages
+ for (unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1],
+ &sums_vector[i], axis, op));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
+ input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+ return Status{};
+}
+
+void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
+ const ReductionOperation &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+ _reduction_axis = axis;
+
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
+ input->info()->tensor_shape(), axis, false);
+ DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
+ ? DataType::S32
+ : output->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ // Configure reduction operation kernels
+ _reduction_kernels_vector.resize(_num_of_stages);
+
+ _memory_group.manage(&_not_reshaped_output);
+ // Create temporary tensors
+ if (_num_of_stages == 1)
+ {
+ // Force an early initialization of the intermediate (not reshaped) output
+ TensorShape output_shape{input->info()->tensor_shape()};
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+ _not_reshaped_output.info()->set_tensor_shape(output_shape);
+ _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
+ }
+ else
+ {
+ _results_vector.resize(_num_of_stages - 1);
+ TensorShape shape{input->info()->tensor_shape()};
+ for (unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _results_vector[i].allocator()->init(
+ input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(&_results_vector[0]);
+ _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
+
+ // Apply ReductionOperation on intermediate stages
+ for (unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(&_results_vector[i]);
+ _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i],
+ axis, op);
+ _results_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1],
+ &_not_reshaped_output, axis, op);
+ _results_vector[last_stage - 1].allocator()->allocate();
+ }
+ _reshape_kernel.configure(&_not_reshaped_output, output);
+ _not_reshaped_output.allocator()->allocate();
+}
+
+void CLArgMinMaxLayerEx::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+ CLScheduler::get().enqueue(_reshape_kernel, false);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..e5122ab8f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
new file mode 100644
index 000000000..c7d0ac8e2
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLCastBool.h"
+
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
+
+using namespace arm_compute;
+
+void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
new file mode 100644
index 000000000..3dede0562
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
+ std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _flip_axis(),
+ _is_prepared(false)
+{
+}
+
+Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+ "Output's depth is invalid.");
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+ // to match output shape
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
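+ // Conceptual sketch (illustrative sizes): a stride-2 transposed convolution of a 4x4 input
+ // with a 3x3 kernel is realised as zero-insertion upsampling followed by a stride-1 3x3
+ // convolution, yielding the 9x9 output reported by transposeconv_output_dimensions above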
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _scale_f.configure(input, &_scaled_output, upsample_info);
+
+ // Setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
+ weights_info);
+ _scaled_output.allocator()->allocate();
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ if (weights->info()->data_layout() == DataLayout::NHWC)
+ {
+ axis_data[0] = 1;
+ axis_data[1] = 2;
+ }
+ else
+ {
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ }
+ _flip_axis.unmap();
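+ // The weights are flipped along their spatial dimensions (indices 1 and 2 for NHWC, 0 and 1
+ // for NCHW) because the transposed convolution is computed as a regular convolution with a
+ // spatially reversed kernel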
+}
+
+void CLDirectTransposeConvLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _scale_f.run();
+ _conv_f.run();
+}
+
+void CLDirectTransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ // Free flipped weights
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..ae9d8afc6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..01989461e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ ARM_COMPUTE_UNUSED(input);
+ ARM_COMPUTE_UNUSED(weights);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+}
+} // namespace
+
+void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ return CLTransposeKernel::validate(input, output);
+}
+
+CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
+ _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
+ _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
+ _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
+ _original_weights(nullptr)
+{
+}
+void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
+ ICLTensor *output, bool retain_internal_weights)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(retain_internal_weights);
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *biases, ICLTensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _accumulate_biases = false;
+ _is_prepared = fc_info.retain_internal_weights;
+ _original_weights = weights;
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.set_target(CLScheduler::get().target());
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ const ICLTensor *weights_to_use = weights;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ bool is_fc_after_conv = false;
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
+ }
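+ // The check above treats the layer as "after conv" when the input dimensions from index 3
+ // upwards match the output dimensions from index 1 upwards, i.e. a 4-D convolution output
+ // that would still need flattening per batch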
+ ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
+ "CLFullyConnectedHybridLayer does not support after conv");
+ ARM_COMPUTE_UNUSED(is_fc_after_conv);
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_output.allocator()->init(
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
+ _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Extract scale factor
+ _scale_factor.allocator()->init(
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
+ _memory_group.manage(&_scale_factor);
+ _scale_factor_kernel.configure(input, &_scale_factor);
+
+ // Quantize input
+ _quantized_input.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
+ _memory_group.manage(&_quantized_input);
+ _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
+
+ // GEMMLowp
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ _memory_group.manage(&_gemmlowp_output);
+ configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output,
+ fc_info.retain_internal_weights);
+ _quantized_input.allocator()->allocate();
+
+ // Multiply scale
+ _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+ weights->info()->quantization_info().uniform().scale);
+ _gemmlowp_output.allocator()->allocate();
+ _scale_factor.allocator()->allocate();
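+ // Hybrid scheme (sketch): output ~= S32 GEMM(quantized input, int8 weights)
+ //                                   * per-row input scale factor * weight scale,
+ // which is what the multiply-scale kernel configured above applies to the GEMM result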
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1 && input->dimension(1) > 1;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv,
+ "CLFullyConnectedHybridLayer does not support after conv");
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Validate Scale factor kernel
+ const ITensorInfo &scale_factor =
+ TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
+
+ // Validate quantization symm8 kernel
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
+
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+ // Validate matrix multiply kernel
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+ // Multiply scale
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));
+
+ return Status{};
+}
+
+void CLFullyConnectedHybridLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Extract scale_factor
+ CLScheduler::get().enqueue(_scale_factor_kernel);
+
+ // Quantize input
+ CLScheduler::get().enqueue(_quant_input_kernel);
+
+ // Run matrix multiply
+ _mm_gemmlowp.run();
+
+ // Multiply scale factor
+ CLScheduler::get().enqueue(_multiply_scale_kernel);
+
+ // Accumulate biases if provided
+ if (_accumulate_biases)
+ {
+ CLScheduler::get().enqueue(_accumulate_biases_kernel);
+ }
+}
+
+void CLFullyConnectedHybridLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](CLTensor *w) {
+ if (!w->is_used())
+ {
+ CLScheduler::get().queue().finish();
+ w->allocator()->free();
+ }
+ };
+
+ // Reshape of the weights if needed (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_kernel.run();
+
+ _are_weights_reshaped = true;
+ // We cannot release _original_weights because it may be used by other nodes
+ }
+
+ // Prepare GEMM and release unused weights
+ _mm_gemmlowp.prepare();
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..2ff4b9659
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights,
+ const ITensorInfo &output,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage)
+{
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
+
+ // Configure output stage for quantized case
+ if (is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+
+ const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
+
+ const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(
+ multiplier, &output_multiplier, &output_shift));
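+ // Illustrative example (hypothetical scales): iq = 0.5f, wq = 0.02f, oq = 0.1f give a real
+ // multiplier of 0.1f, decomposed into a fixed-point int32 multiplier and a right shift
+ // (roughly 0.1 ~= 0.8 * 2^-3) for the requantization stage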
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_min_bound = 0;
+ gemmlowp_output_stage.gemmlowp_max_bound = 255;
+ gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+ gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+ }
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias,
+ const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
+{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
+ if (is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+    // The matrix multiplication requires negated offsets, so adjust the QuantizationInfo():
+    // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
+ const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output,
+ gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ return CLTransposeKernel::validate(input, output);
+}
+
+CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(),
+ _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(),
+ _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager),
+ _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(),
+ _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+ _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+{
+}
+void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(),
+ gemmlowp_output_stage);
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
+ if (_is_quantized)
+ {
+    // The matrix multiplication requires negated offsets, so adjust the QuantizationInfo():
+    // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+ input->info()->set_quantization_info(QuantizationInfo(
+ input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->info()->set_quantization_info(QuantizationInfo(
+ weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
+
+    // Restore the original QuantizationInfo, as the input and weights may be reused by other
+    // fully connected layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
+ }
+}
+
+void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ ARM_COMPUTE_ERROR_ON(
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be
+ // linearized
+
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(input->info()
+ ->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(shape_flatten)
+ .set_data_layout(DataLayout::NCHW));
+
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_layer.configure(input, &_flatten_output);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flatten_output, weights, bias, output, fc_info);
+
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayerEx::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(input, weights, bias, output, fc_info);
+}
+
+void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *biases, ICLTensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_converted = true;
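+  // If the weights are not to be transposed they are treated as already reshaped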
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = fc_info.retain_internal_weights;
+ _original_weights = weights;
+
+ if (_weights_manager)
+ {
+ _weights_manager->manage(weights);
+ }
+
+ const ICLTensor *weights_to_use = weights;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
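+    // The layer follows a convolution if the input dimensions above index 2 match the output
+    // shape starting from its second dimension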
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ if (_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed_function.configure(weights);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
+ _weights_manager->acquire(weights, &_reshape_weights_managed_function));
+ }
+ else
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ {
+ if (_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+ {
+ _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
+ _weights_manager->acquire(weights, &_convert_weights_managed));
+ }
+ else
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use, &_converted_weights_output,
+ input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights_output;
+ }
+ _are_weights_converted = false;
+ }
+
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, biases, output, fc_info);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, biases, output, fc_info);
+ }
+}
+
+Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+
+ const ITensorInfo &flatten_input = TensorInfo(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(input))
+ .set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights =
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
+
+ return Status{};
+}
+
+void CLFullyConnectedLayerEx::run()
+{
+ if (!_is_prepared)
+ {
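+    // Allocate the auxiliary weight buffers lazily on the first run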
+ if (!_are_weights_reshaped)
+ _reshape_weights_output.allocator()->allocate();
+ if (!_are_weights_converted)
+ _converted_weights_output.allocator()->allocate();
+ _is_prepared = true;
+ }
+
+ {
+ if (!_weights_manager)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ }
+
+ // Pointer to current weights
+ const ICLTensor *cur_weights = _original_weights;
+    // Reshape the weights if needed
+ if (!_are_weights_reshaped)
+ {
+ if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+ {
+ _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>(
+ _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+ }
+ else
+ {
+ _reshape_weights_function.run();
+ cur_weights = &_reshape_weights_output;
+ }
+ }
+
+ // Convert weights if needed
+ if (!_are_weights_converted)
+ {
+ if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+ {
+ _weights_manager->run(cur_weights, &_convert_weights_managed);
+ }
+ else
+ {
+ _convert_weights.run();
+ }
+ }
+
+    // Prepare GEMM
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ }
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Linearize input if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ _flatten_layer.run();
+ }
+
+ // Run matrix multiply
+ if (_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+}
+
+void CLFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+ if(!_is_prepared)
+ {
+ if(!_weights_manager)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ }
+
+ auto release_unused = [](CLTensor * w)
+ {
+ if(!w->is_used())
+ {
+ CLScheduler::get().queue().finish();
+ w->allocator()->free();
+ }
+ };
+
+ // Pointer to current weights
+ const ICLTensor *cur_weights = _original_weights;
+
+ // Reshape of the weights if needed (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ {
+ cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+ }
+ else
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ }
+ _are_weights_reshaped = true;
+ }
+
+ // Convert weights if needed (happens only once)
+ if(!_are_weights_converted)
+ {
+ if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+ {
+ _weights_manager->run(cur_weights, &_convert_weights_managed);
+ }
+ else
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+ cur_weights->mark_as_unused();
+ }
+
+ _are_weights_converted = true;
+ }
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+    // Prepare GEMM and release unused weights
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+
+ // Release converted weights if unused
+ release_unused(&_reshape_weights_output);
+ release_unused(&_converted_weights_output);
+
+ _is_prepared = true;
+ }
+#endif
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..157b4d977
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
+ const arm_compute::ICLTensor *weights,
+ const arm_compute::ICLTensor *biases,
+ arm_compute::ICLTensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape,
+ KernelType kernel_type)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
+ const ICLTensor *input_to_use = input;
+ if (_needs_reshape)
+ {
+    // Initialize the intermediate buffer with the requested shape, then reshape the input
+    // into it
+ auto_init_if_empty(*_cl_buffer.info(),
+ _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
+ _input->info()->data_layout()));
+ _cl_reshape.configure(_input, &_cl_buffer);
+ input_to_use = &_cl_buffer;
+ }
+
+ _cl_fc = [&]() {
+ if (kernel_type == KernelType::GENERAL)
+ {
+ auto fc = new arm_compute::CLFullyConnectedLayerEx{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
+ {
+ bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
+ input->info()->data_type() == DataType::F16) &&
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
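+        // The hybrid path uses float activations with 8-bit weights: temporarily tag the weights
+        // as QASYMM8_SIGNED for configuration, then restore their original data type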
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+        const auto original_weights_data_type = weights_info->data_type();
+        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+        fc->configure(input_to_use, _weights, _biases, _output);
+        weights_info->set_data_type(original_weights_data_type);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ auto fc = new arm_compute::CLFullyConnectedLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ }
+ else
+ {
+ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
+ }
+
+ }();
+
+ if (_needs_reshape)
+ {
+ // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_buffer.allocator()->allocate();
+ }
+}
+
+void CLFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _cl_reshape.run();
+
+ _cl_fc->run();
+}
+
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
new file mode 100644
index 000000000..e0b833b04
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+using namespace arm_compute;
+
+void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+ int axis)
+{
+ auto k = support::cpp14::make_unique<CLGatherExKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return CLGatherExKernel::validate(input, indices, output, axis);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..65b89a389
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..5a7e40839
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+ auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ k->configure(input, output, gamma, beta, epsilon);
+ _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..28e5bc0da
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
new file mode 100644
index 000000000..aa9f32ec6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLOneHot.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+namespace arm_compute
+{
+CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+ _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ PixelValue off_value, int depth, int axis)
+{
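+  // With a scalar off_value, the memset kernel first fills the output with off_value and the
+  // OneHot kernel then writes on_value at the indexed positions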
+ _has_to_memset = true;
+ _memset_kernel.configure(output, off_value);
+ _onehot_kernel.configure(indices, on_value, output, depth, axis);
+}
+Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis)
+{
+ return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::run()
+{
+ if (_has_to_memset)
+ {
+ CLScheduler::get().enqueue(_memset_kernel, true);
+ }
+
+ CLScheduler::get().enqueue(_onehot_kernel, false);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..02ee4ad8a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+ _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReductionOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
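+  // When keep_dims is set, the last reduction writes directly to the output; otherwise an extra
+  // intermediate tensor feeds the final reshape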
+
+ ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
+
+ // Create temporary tensor infos
+ auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ interm_tensors[i].set_data_layout(input->data_layout());
+ interm_tensors[i].set_quantization_info(input->quantization_info());
+ }
+
+  // Build a vector of tensor infos ordered sequentially: input, intermediates, output
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+  // Validate the reduce operation for every kernel
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ if (!keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+ _keep_dims = keep_dims;
+
+  // NOTE The axis set must contain no duplicates.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ if (num_of_kernels < 1)
+ {
+ throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
+ }
+
+ _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+  // Build a vector of tensors ordered sequentially: input, intermediates, output
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ReductionOperation on all kernels
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
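+    // Every kernel except the last one (when keep_dims is set) writes to a managed intermediate
+    // tensor; each intermediate is allocated once the kernel consuming it has been configured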
+ if (!keep_dims || i != (num_of_kernels - 1))
+ {
+ _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _memory_group.manage(&_interm_tensors[i]);
+ }
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ if (i != 0)
+ {
+ _interm_tensors[i - 1].allocator()->allocate();
+ }
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+ _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+ }
+}
+
+void CLReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
new file mode 100644
index 000000000..a502f032e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSplitVEx.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include <cassert>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs,
+ unsigned int num_splits)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1,
+ "size_splits must be a 1-D tensor.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(),
+ "Number of output tensors does not match number of splits.");
+ return Status{};
+}
+
+Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs,
+ uint32_t split_dim)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for (unsigned int d = 0; d < input->num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+ unsigned int axis_offset = 0;
+ // Validate output tensors
+ for (const auto &output : outputs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ // Get output shape
+ const TensorShape output_shape = output->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+ const size_t axis_split_step = output_shape[split_dim];
+
+    // Auto-initialize the output if it has not been initialized yet
+ TensorInfo tmp_output_info = *output->clone();
+ auto_init_if_empty(tmp_output_info,
+ input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(split_dim, axis_offset);
+ end_coords.set(split_dim, axis_offset + axis_split_step);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
+
+ axis_offset += axis_split_step;
+ }
+
+ return Status{};
+}
+
+void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs,
+ std::vector<CLSlice> &_slice_functions, uint32_t split_dim)
+{
+ unsigned int axis_offset = 0;
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+ int out_iter = 0;
+ for (const auto &output : outputs)
+ {
+ const TensorShape output_shape = output->info()->tensor_shape();
+ auto op_size = output_shape.total_size();
+ if (!op_size)
+ {
+ continue;
+ }
+
+ assert(op_size != 0);
+ assert(split_dim <= output_shape.num_dimensions());
+
+ const size_t axis_split_step = output_shape[split_dim];
+
+    // Auto-initialize the output if it has not been initialized yet
+ TensorInfo tmp_output_info = *output->info()->clone();
+ auto_init_if_empty(
+ tmp_output_info,
+ input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(split_dim, axis_offset);
+ end_coords.set(split_dim, axis_offset + axis_split_step);
+
+ // Configure slice function
+ _slice_functions[out_iter].configure(input, output, start_coords, end_coords);
+
+ // Set valid region from shape
+ outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+ axis_offset += axis_split_step;
+ }
+}
+
+} // namespace
+
+CLSplitVEx::CLSplitVEx()
+ : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions()
+{
+}
+
+void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+ const std::vector<ICLTensor *> &outputs, unsigned int num_splits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits));
+
+ _input = input;
+ _size_splits = size_splits;
+ _outputs = outputs;
+ _num_splits = num_splits;
+
+ // Create tensor slices
+ _slice_functions.resize(_num_splits);
+
+ // Extract output tensor info
+ std::vector<ITensorInfo *> outputs_info;
+ for (auto &output : _outputs)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ outputs_info.emplace_back(output->info());
+ }
+
+ // Validate slices
+ ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim));
+
+ // Configure slices
+ configure_slices(_input, _outputs, _slice_functions, split_dim);
+}
+
+void CLSplitVEx::run()
+{
+ // execute the slices
+ for (unsigned i = 0; i < _outputs.size(); ++i)
+ {
+ _slice_functions[i].run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..3ac95a8e6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()*/
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+  // _total_bits must be divisible by _bits.
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+    // _n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+#endif // Disable GPU implementation
+ {
+ // DO NOTHING for CPU.
+ }
+}
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+#endif
+ {
+ run_on_cpu();
+ }
+}
+
+#if 0
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+  // 1. CLTopKV2Init initializes the key and index buffers.
+  //    - The key buffer is filled with the layer's input values
+  //    - The index buffer is filled with the corresponding indices
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+  // 2. Repeat (total_bits / bits) times.
+  //    - total_bits is the number of bits of the data type (e.g., 32 for float)
+  //    - bits defines the number of radix buckets (e.g., 16 buckets when bits is 4)
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate prefix sum locally with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate prefix sum within a work group
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Calculate global prefix sum
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+  // 3. Get the first negative index
+  //    Because the in/out buffers are swapped at the end of the loop above,
+  //    the sorted results now live in the "in" buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+  // 4. Correct the ordering of negative values
+  //    - Since the radix sort ignores the sign, negative values are sorted as if they were
+  //    larger than positive ones.
+  //    The reordered data will be stored in _p_out_key_buf and _p_out_ind_buf
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
+#if 0
+  // The code below is left for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+#endif // Disable GPU implementation
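+
+// The disabled GPU path above is a radix sort. As a rough illustration of what steps 2.1-2.5
+// compute in a single pass, a minimal single-threaded CPU sketch (a hypothetical helper, not
+// part of this class; it assumes unsigned integer keys, out_key sized like in_key, and <vector>
+// being available) could look like:
+//
+//   static void radix_pass(const std::vector<uint32_t> &in_key, std::vector<uint32_t> &out_key,
+//                          int pass, int bits)
+//   {
+//     const uint32_t buckets = 1u << bits;
+//     std::vector<uint32_t> hist(buckets, 0); // 2.1 histogram of the current digit
+//     for (auto k : in_key)
+//       ++hist[(k >> (pass * bits)) & (buckets - 1)];
+//     uint32_t sum = 0; // 2.2-2.4 exclusive prefix sum over the buckets
+//     for (auto &h : hist)
+//     {
+//       const uint32_t count = h;
+//       h = sum;
+//       sum += count;
+//     }
+//     for (auto k : in_key) // 2.5 stable reorder by the current digit
+//       out_key[hist[(k >> (pass * bits)) & (buckets - 1)]++] = k;
+//   }
+//
+// The real implementation additionally carries an index buffer, distributes the histogram over
+// _GROUPS * _ITEMS work items, and ping-pongs between the in/out buffers after every pass.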
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+  if (rank > 2)
+    throw std::runtime_error("Not supported rank.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Not supported type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
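+
+// For reference, the per-row TopK performed by nnfw::rt::optimized_ops::TopK above can be
+// sketched as follows (a hypothetical standalone helper, assuming the k largest values of each
+// row are returned together with their original indices; requires <algorithm>, <numeric> and
+// <vector>):
+//
+//   template <typename T>
+//   void topk_row(const T *row, int row_size, int k, int32_t *out_ind, T *out_val)
+//   {
+//     std::vector<int32_t> idx(row_size);
+//     std::iota(idx.begin(), idx.end(), 0); // 0, 1, ..., row_size - 1
+//     std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
+//                       [row](int32_t a, int32_t b) { return row[a] > row[b]; });
+//     for (int i = 0; i < k; ++i)
+//     {
+//       out_ind[i] = idx[i];
+//       out_val[i] = row[idx[i]];
+//     }
+//   }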
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
new file mode 100644
index 000000000..3215d01a7
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _function()
+{
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
+ output->info(), deconv_info, invalid_right,
+ invalid_bottom, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
+ invalid_bottom, weights_info);
+ _function = std::move(f);
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ f->configure(compile_context, input, weights, bias, output, deconv_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+}
+
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ switch (CLTransposeConvLayer::get_deconvolution_method(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ // Validate direct convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ // Validate gemm-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
+ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
+ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_UNUSED(output, bias, weights_info);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
+ invalid_bottom != 0)
+ {
+ return DeconvolutionMethod::DIRECT;
+ }
+
+ return DeconvolutionMethod::GEMM;
+}
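+
+// For example (hypothetical shapes): a 2x2 kernel with stride (2, 2) and zero invalid borders
+// selects DeconvolutionMethod::GEMM, whereas a 3x3 kernel with the same stride, or any non-zero
+// invalid_right/invalid_bottom, falls back to DeconvolutionMethod::DIRECT.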
+
+void CLTransposeConvLayer::run()
+{
+ prepare();
+ _function->run();
+}
+
+void CLTransposeConvLayer::prepare() { _function->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
new file mode 100644
index 000000000..80fbf359d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/NEFunctionsEx.h"
+
+// NOTE This translation unit intentionally contains no code; it exists only to verify that
+// "NEFunctionsEx.h" compiles on its own.
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
new file mode 100644
index 000000000..2fc94b267
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
+#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/MemorySupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation COP>
+void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
+ ITensor *output)
+{
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+template <BinaryLogicalOperation COP>
+Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op)
+{
+ return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
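+
+// Usage sketch (hypothetical tensors, shown for illustration only): the static variant fixes the
+// operation at compile time, the dynamic variant takes it as a runtime argument.
+//
+//   Tensor a, b, out; // allocated and initialized elsewhere
+//   NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND> logical_and;
+//   logical_and.configure(&a, &b, &out);
+//   logical_and.run();
+//
+//   NEBinaryLogicalOperation logical_op;
+//   logical_op.configure(&a, &b, &out, BinaryLogicalOperation::OR);
+//   logical_op.run();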
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
new file mode 100644
index 000000000..6ad3e1b12
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECastBool.h"
+
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
+#include "support/MemorySupport.h"
+
+using namespace arm_compute;
+
+void NECastBool::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NECastBoolKernel::validate(input, output);
+}
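+
+// Usage sketch (hypothetical tensors, for illustration only): NECastBool converts a U8 boolean
+// tensor into the data type of the already-initialized output tensor.
+//
+//   Tensor bool_in, float_out; // allocated and initialized elsewhere (U8 -> e.g. F32)
+//   NECastBool cast;
+//   cast.configure(&bool_in, &float_out);
+//   cast.run();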
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
new file mode 100644
index 000000000..e0ab3e025
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+#include "support/MemorySupport.h"
+
+using namespace arm_compute;
+
+void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
+{
+ auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..a123439d9
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+}
+} // namespace
+
+void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
+{
+ auto k = support::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ return NETransposeKernel::validate(input, output);
+}
+
+NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
+ _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
+ _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
+ _accumulate_biases(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _accumulate_biases = false;
+ _original_weights = weights;
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ bool _is_fc_after_conv;
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
+ }
+  ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv,
+                           "NEFullyConnectedHybridLayer does not support an input coming from a "
+                           "convolution layer");
+ (void)_is_fc_after_conv;
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_output.allocator()->init(
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
+ _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Quantize input
+ _quantized_input.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
+ _scale_factor.allocator()->init(
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+ _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
+
+ // GEMM
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
+
+ // Multiply scale
+ _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+ weights->info()->quantization_info().uniform().scale);
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+
+ _quantized_input.allocator()->allocate();
+ _scale_factor.allocator()->allocate();
+ _gemmlowp_output.allocator()->allocate();
+}
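+
+// In short, the hybrid path computes per output row (a sketch of the math, not the exact kernel
+// implementation):
+//
+//   q_in[i] = round(in[i] / in_scale)                 // symmetric 8-bit quantization, in_scale
+//                                                     // produced per batch row by the kernel above
+//   acc[o]  = sum_i q_in[i] * q_w[o][i]               // integer GEMM with S32 accumulators
+//   out[o]  = acc[o] * in_scale * w_scale (+ bias[o]) // rescale to float, then accumulate biases
+//
+// where w_scale is the weights->info()->quantization_info().uniform().scale passed above.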
+
+Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+ // Validate quantization kernel
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
+ const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
+ &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
+
+ return Status{};
+}
+
+void NEFullyConnectedHybridLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Quantize input
+ NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);
+
+ // Run matrix multiply
+ _mm_gemmlowp.run();
+
+ // Multiply scale factor
+ NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);
+
+ // Accumulate biases if provided
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+}
+
+void NEFullyConnectedHybridLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ _are_weights_reshaped = true;
+      // We cannot release _original_weights because it may be used by other nodes
+ }
+
+    // Prepare GEMM and release unused weights
+ _mm_gemmlowp.prepare();
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ _is_prepared = true;
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..cb7557a5a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ if (is_data_type_quantized_asymmetric(input.data_type()))
+ {
+    // The GEMMLowp core expects negated offsets, so adjust the QuantizationInfo:
+    // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info(input.quantization_info().uniform().scale,
+ -input.quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().uniform().scale,
+ -weights.quantization_info().uniform().offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
+ &input, &weights, nullptr, &output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+ _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+ _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
+ _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
+ _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ if (_is_quantized)
+ {
+    // The GEMMLowp core expects negated offsets, so adjust the QuantizationInfo:
+    // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+ input->info()->set_quantization_info(QuantizationInfo(
+ input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->info()->set_quantization_info(QuantizationInfo(
+ weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+
+    // Restore the original QuantizationInfo, as the input and weights may be used by other
+    // fully connected layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */));
+ }
+}
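+
+// Why the offsets are negated above (informal reasoning, not upstream documentation): with an
+// asymmetric quantization r = scale * (q - z), the real-valued product needs terms of the form
+// (q_in - z_in) * (q_w - z_w). The GEMMLowp core adds the offsets stored in the tensors'
+// QuantizationInfo to the quantized values, so storing -z_in and -z_w makes it accumulate
+// exactly (q_in - z_in) * (q_w - z_w), and the original info is restored right after configuring.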
+
+void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be
+ // linearized
+
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ shape_flatten));
+
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_kernel.configure(input, &_flatten_output);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flatten_output, weights, output);
+
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_converted = true;
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _accumulate_biases = false;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
+
+ // Configure gemmlowp output
+ if (_is_quantized)
+ {
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::S32));
+ }
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr && !_is_quantized)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use, &_converted_weights_output,
+ input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights_output;
+ _are_weights_converted = false;
+ }
+
+ ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, tmp_output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, tmp_output);
+ }
+
+ // Configure output stage for asymmetric quantized types
+ if (_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().uniform().scale *
+ weights->info()->quantization_info().uniform().scale /
+ output->info()->quantization_info().uniform().scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+ &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+ output_shift,
+ output->info()->quantization_info().uniform().offset);
+ _gemmlowp_output.allocator()->allocate();
+ }
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
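+
+// The fixed-point output stage above rescales the S32 accumulators back to 8 bits. As a small
+// worked example with hypothetical scales: in_scale = 0.5, w_scale = 0.25 and out_scale = 2.0
+// give multiplier = 0.5 * 0.25 / 2.0 = 0.0625 = 0.5 * 2^-3, which is represented as a Q31
+// multiplier of about 2^30 combined with a right shift of 3.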
+
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const ITensorInfo &flatten_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_flatten_shape(input)));
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights =
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+ // Validate output stage for asymmetric quantized types
+ if (is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+ &gemmlowp_output, biases, output));
+ }
+
+ return Status{};
+}
+
+void NEFullyConnectedLayerEx::run()
+{
+ if (!_is_prepared)
+ {
+ if (!_are_weights_reshaped)
+ _reshape_weights_output.allocator()->allocate();
+ if (!_are_weights_converted)
+ _converted_weights_output.allocator()->allocate();
+ _is_prepared = true;
+ }
+
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Reshape of the weights
+ if (!_are_weights_reshaped)
+ {
+ _reshape_weights_function.run();
+ }
+
+ // Convert weights if needed
+ if (!_are_weights_converted)
+ {
+ _convert_weights.run();
+ }
+
+    // Prepare GEMM
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ }
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Linearize input if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply
+ if (_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+
+ // Accumulate biases if provided
+ if (_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+ else
+ {
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+ }
+}
+
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Pointer to current weights
+ const ITensor *cur_weights = _original_weights;
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ _are_weights_reshaped = true;
+ }
+
+ // Convert weights if needed (happens only once)
+ if (!_are_weights_converted)
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+
+ cur_weights->mark_as_unused();
+ _are_weights_converted = true;
+ }
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+    // Prepare GEMM and release unused weights
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+
+ // Release converted weights if unused
+ release_unused(&_reshape_weights_output);
+ release_unused(&_converted_weights_output);
+
+ _is_prepared = true;
+ }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..dc6c78478
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+ const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases,
+ arm_compute::ITensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape,
+ KernelType kernel_type)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
+ const ITensor *input_to_use = input;
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _neon_reshape.configure(_input, &_neon_buffer);
+ input_to_use = &_neon_buffer;
+ }
+
+ _neon_fc = [&]() {
+ if (kernel_type == KernelType::GENERAL)
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+ bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+        const auto origin_weights_data_type = weights_info->data_type();
+        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+        fc->configure(input_to_use, _weights, _biases, _output);
+        weights_info->set_data_type(origin_weights_data_type);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ }
+ }();
+
+ // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ if (_needs_reshape)
+ {
+ _neon_buffer.allocator()->allocate();
+ }
+}
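+
+// Kernel selection above, summarized from the conditions in the lambda:
+//   - KernelType::GENERAL                          -> NEFullyConnectedLayerEx
+//   - KernelType::PREPROCESSED_WEIGHTS
+//       * F32 input with S8/QASYMM8_SIGNED weights -> NEFullyConnectedHybridLayer
+//       * anything else                            -> NEFullyConnectedLayer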
+
+void NEFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _neon_reshape.run();
+
+ _neon_fc->run();
+}
+
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..433c35d58
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/MemorySupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..52d58accf
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/MemorySupport.h"
+
+using namespace arm_compute;
+
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+ ITensor *output, ITensor *hits)
+{
+ auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
+
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..16d74e62d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+ _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+ ITensor *beta, float epsilon)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Configure Kernels
+ _is_nchw = data_layout == DataLayout::NCHW;
+
+ if (!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+ _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
+ PermutationVector(2U, 0U, 1U));
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+ }
+}
+
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return NEInstanceNormalizationLayerKernelEx::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW),
+ &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayerEx::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if (!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
+ NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+ // Permute output
+ if (!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
new file mode 100644
index 000000000..275c55024
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "support/MemorySupport.h"
+#include <utility>
+namespace arm_compute
+{
+void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>();
+ k->configure(indices, depth, on_value, off_value, output, axis);
+ _kernel = std::move(k);
+}
+Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
+{
+ return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
new file mode 100644
index 000000000..cb1a26304
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output, ReductionOperation op)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], op);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+    // We have to sort the reduction axis vectors in order for remove_dimension
+    // to work properly (see the worked example after this function)
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
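+
+// Worked example for the remove_dimension logic above: with a 4-D input and
+// reduction axes {3, 1}, the axes are sorted to {1, 3}; the first removal drops
+// dimension 1 - 0 = 1, and the second removal drops dimension 3 - 1 = 2 of the
+// already reduced shape, which corresponds to dimension 3 of the original input.
+// Without sorting, the (axis_local[i] - i) adjustment would point at the wrong
+// dimension.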
+
+void NEReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
new file mode 100644
index 000000000..26a887912
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info())
+ .set_data_layout(input->info()->data_layout()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
+
+void NEReduceSum::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
new file mode 100644
index 000000000..aa165cc15
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+
+NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _conv_f(),
+ _upsample_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _weights_flipped(),
+ _flip_axis(),
+ _original_weights(nullptr),
+ _input(nullptr),
+ _info(),
+ _is_prepared(false)
+{
+}
+
+Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+ weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ }
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
+ }
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(
+ input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
+ scale_out_info.dimension(batches_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
+ scale_out_info.dimension(channel_idx));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, WeightsInfo()));
+
+ return Status{};
+}
+
+void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias,
+ ITensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom)
+{
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
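+  // Implementation outline: flip the weights along the width/height axes, upsample the
+  // input according to the stride with the padding computed below, then run a stride-1
+  // convolution over the upsampled tensor to produce the transposed convolution result.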
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+ invalid_right, invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _is_prepared = false;
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _memory_group.manage(&_scaled_output);
+
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = static_cast<uint32_t>(width_idx);
+ axis_data[1] = static_cast<uint32_t>(height_idx);
+
+ _scaled_output.allocator()->allocate();
+}
+
+void NETransposeConvLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _upsample_f.run();
+ _conv_f.run();
+}
+
+void NETransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h
new file mode 100644
index 000000000..f94effea1
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/topk_v2.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file topk_v2.h
+ * @brief This file contains TopK method and TopContainer class for TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+typedef int32_t int32;
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+/**
+ * @brief class to define TopK operation
+ * @note The following code is implemented and modified while referring to the TFLite topk_v2.cc
+ * file. TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM and TENSOR_INT32,
+ * unlike TFLite.
+ * (TFLite additionally supports kTfLiteInt64.)
+ *
+ * This class collects the indexes of the top k values. It is based on the
+ * tensorflow::gtl::TopN<> template but, as an optimization,
+ * it re-uses the same container.
+ */
+template <typename T> class TopContainer
+{
+public:
+ /**
+   * @brief Prevent the default constructor of this class
+ */
+ TopContainer() = delete;
+ /**
+ * @brief Constructor with params
+   * @param [in] k The top k predictions
+   * @param [in] row_size Size of row in data
+ */
+ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+ {
+ container_.reserve(std::min(k, row_size) + 1);
+ }
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ */
+ TopContainer(const TopContainer &) = delete;
+  /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ * @return Reference of TopContainer
+ */
+ TopContainer &operator=(const TopContainer &) = delete;
+
+ /**
+ * @brief Start collecting
+ * @param [in] values To set as values
+ * @return N/A
+ */
+ void start_collecting(const T *values)
+ {
+ values_ = values;
+ container_.clear();
+ }
+
+ /**
+ * @brief Push a value to be compared for topk
+ * @param [in] a A value to compare
+ * @return N/A
+ */
+ void push(int32 a)
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)k_)
+ {
+ container_.push_back(a);
+ if (container_.size() == (size_t)(k_ + 1))
+ {
+ std::make_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+ else if (comparator(a, container_.front()))
+ {
+ container_.back() = a;
+ std::push_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+
+ /**
+ * @brief Get sorted result from pushed values
+ * @return Reference of vector with sorted values
+ */
+ const std::vector<int32> &sorted_result()
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)(k_))
+ {
+ std::sort(container_.begin(), container_.end(), comparator);
+ }
+ else
+ {
+ std::sort_heap(container_.begin(), container_.end() - 1, comparator);
+ container_.resize(k_);
+ }
+ return container_;
+ }
+
+private:
+ int32 k_;
+ std::vector<int32> container_;
+ const T *values_ = nullptr;
+
+ bool compare_fun(int32 a, int32 b) const
+ {
+ if (values_[b] < values_[a])
+ {
+ return true;
+ }
+ else if (values_[b] > values_[a])
+ {
+ return false;
+ }
+ else
+ {
+ return a < b;
+ }
+ }
+};
+
+/**
+ * @brief Operates TopK operation with params
+ * @param [in] row_size Size of row in data
+ * @param [in] num_rows The number of rows in data
+ * @param [in] data To be operated in
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+ T *output_values)
+{
+ TopContainer<T> topc(k, row_size);
+ for (int row = 0; row < num_rows; ++row)
+ {
+ const T *values_row = data + row * row_size;
+ topc.start_collecting(values_row);
+ for (int32 c = 0; c < row_size; ++c)
+ {
+ topc.push(c);
+ }
+
+ // Prepare output buffers.
+ int32 *indexes_row = output_indexes + row * k;
+ T *output_row = output_values + row * k;
+ // We always assume that the output is sorted.
+ const auto &top_k = topc.sorted_result();
+ std::copy(top_k.begin(), top_k.end(), indexes_row);
+ std::transform(top_k.begin(), top_k.end(), output_row,
+ [values_row](const int32 loc) { return values_row[loc]; });
+ }
+}
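+
+// Minimal usage sketch (illustrative only; the buffers below are hypothetical):
+//
+//   const float data[5] = {0.1f, 0.9f, 0.3f, 0.9f, 0.2f};
+//   int32 indexes[2];
+//   float values[2];
+//   TopK<float>(/*row_size=*/5, /*num_rows=*/1, data, /*k=*/2, indexes, values);
+//   // indexes == {1, 3}, values == {0.9f, 0.9f}: results are ordered by descending
+//   // value, with ties broken by ascending index (see TopContainer::compare_fun).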
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
new file mode 100644
index 000000000..5ea6cdadd
--- /dev/null
+++ b/compute/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectories()
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
new file mode 100644
index 000000000..09f67259c
--- /dev/null
+++ b/compute/cker/CMakeLists.txt
@@ -0,0 +1,19 @@
+nnfw_find_package(Eigen REQUIRED)
+nnfw_find_package(GEMMLowp REQUIRED)
+nnfw_find_package(Ruy REQUIRED)
+
+add_library(nnfw_lib_cker INTERFACE)
+target_link_libraries(nnfw_lib_cker INTERFACE eigen)
+target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
+target_link_libraries(nnfw_lib_cker INTERFACE ruy)
+target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
+target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
+if(PROFILE_RUY)
+ target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
+endif(PROFILE_RUY)
+
+target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+# Workaround to avoid warning
+# TODO Resolve warning
+target_compile_options(nnfw_lib_cker INTERFACE -Wno-attributes)
diff --git a/compute/cker/README.md b/compute/cker/README.md
new file mode 100644
index 000000000..3d98362ab
--- /dev/null
+++ b/compute/cker/README.md
@@ -0,0 +1,7 @@
+# cker
+
+cker - CPU kernel library
+
+__cker__ means `CPU kernel`
+
+Currently, __cker__ is a port of TensorFlow Lite's operation kernels.
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
new file mode 100644
index 000000000..e08040632
--- /dev/null
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_NEON_TENSOR_UTILS_H__
+#define __NNFW_CKER_NEON_TENSOR_UTILS_H__
+
+#include <ruy/path.h>
+#include <ruy/ruy.h>
+#include "cker/Types.h"
+#include "cker/neon/neon_check.h"
+#include "cker/ruy/RuySupport.h"
+#include "util/logging.h"
+#if defined __linux__ && defined __aarch64__
+#include <sys/auxv.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+
+#ifdef USE_NEON
+
+#define kFloatWeightsPerNeonLane 4
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+
+constexpr int kFloatValuesPerNeonVector = 4;
+
+// TODO(ahentz): Clean up.
+using int8 = std::int8_t;
+using uint8 = std::uint8_t;
+using int16 = std::int16_t;
+using uint16 = std::uint16_t;
+using int32 = std::int32_t;
+using uint32 = std::uint32_t;
+
+template <int PerNeonSize> inline int RoundDownVectors(int size)
+{
+ return size & ~(PerNeonSize - 1);
+}
+
+// Allocates at least size bytes of uninitialized storage whose alignment is
+// specified by alignment. The size parameter must be an integral multiple of
+// alignment.
+// The caller is responsible for freeing the allocated memory by calling free on
+// the passed freeing_buffer pointer.
+void *aligned_alloc(size_t alignment, size_t size, void **freeing_buffer)
+{
+ *freeing_buffer = malloc(size + alignment);
+ const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT
+ return offset == 0 ? *freeing_buffer : ((char *)*freeing_buffer + (alignment - offset)); // NOLINT
+}
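+
+// Typical use (illustrative): compute through the aligned pointer, but pass the raw
+// pointer returned via freeing_buffer to free():
+//   void *raw = nullptr;
+//   int8_t *buf = reinterpret_cast<int8_t *>(aligned_alloc(4, 64, &raw));
+//   /* ... use buf ... */
+//   free(raw);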
+
+inline int32_t AccumulateNeonLane(const int32x4_t lane)
+{
+#ifdef __aarch64__
+ return vaddvq_s32(lane);
+#else
+ int64x2_t pairwiseAdded = vpaddlq_s32(lane);
+ return vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+#endif
+}
+
+} // namespace
+
+// The implementation of dotprod detection is copied from ruy's internal
+// function DetectDotprod().
+// At the moment it's only implemented on Linux ARM64. Consider syncing again
+// with ruy in the future to share improvements.
+#if defined __linux__ && defined __aarch64__
+inline bool DetectDotprodByLinuxAuxvMethod()
+{
+  // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers;
+  // however, we need to support building against older headers for the time
+ // being.
+ const int kLocalHwcapAsimddp = 1 << 20;
+ return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
+}
+#endif
+
+inline bool DetectArmNeonDotprod()
+{
+#if defined __linux__ && defined __aarch64__
+ return DetectDotprodByLinuxAuxvMethod();
+#endif
+
+ return false;
+}
+
+inline bool HasSdotInstruction()
+{
+ static const bool has_dotprod = DetectArmNeonDotprod();
+ return has_dotprod;
+}
+
+#ifdef __aarch64__
+// We interleave vector data to make the dot product logic more efficient.
+// Suppose the vectors are:
+// a0 a1 a2 a3 a4 a5 ...
+// b0 b1 b2 b3 b4 b5 ...
+// c0 c1 c2 c3 c4 c5 ...
+// d0 d1 d2 d3 d4 d5 ...
+// e0 e1 e2 e3 e4 e5 ...
+// This code interleaves them like this:
+// a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ...
+// e0 e1 e2 e3 f0 f1 f2 f3 ...
+// Once the data is interleaved, each 16-byte read from the vectors pointer
+// contains 4 bytes from each of 4 vectors.
+inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols,
+ void **shuffled_vectors_free)
+{
+ const int kWeightsPerUint32 = 4;
+
+ int8 *shuffled_vectors = reinterpret_cast<int8 *>(
+ aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
+
+ for (int i = 0; i < n_batch; i += 4)
+ {
+ int8 *shuffled_vectors_ptr = shuffled_vectors + (i * m_cols);
+ const int8 *unshuffled_vec0_ptr = reinterpret_cast<const int8 *>(vectors) + (i * m_cols);
+ const int8 *unshuffled_vec1_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 1) * m_cols);
+ const int8 *unshuffled_vec2_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 2) * m_cols);
+ const int8 *unshuffled_vec3_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 3) * m_cols);
+ const int8 *const end_vec0_ptr = unshuffled_vec1_ptr;
+
+ while (unshuffled_vec0_ptr != end_vec0_ptr)
+ {
+ asm volatile(
+          // This code path requires that (m_cols % 16) == 0 so we can safely
+ // read in 16-byte chunks from each row.
+ "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
+ "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
+ "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
+ "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
+
+ "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
+
+ : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr),
+ [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr),
+ [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr),
+ [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr),
+ [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr)
+ :
+ : "v0", "v1", "v2", "v3", "cc", "memory");
+ }
+ }
+
+ return reinterpret_cast<const int8_t *>(shuffled_vectors);
+}
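+
+// A scalar sketch of the shuffle above (illustrative only, not part of the kernel):
+// for each group of 4 batch vectors starting at index i, 4-byte chunks are taken
+// round-robin from the 4 rows, which is what the st4 lane stores implement:
+//
+//   for (int chunk = 0; chunk < m_cols; chunk += 4)
+//     for (int vec = 0; vec < 4; ++vec)
+//       std::memcpy(dst, vectors + (i + vec) * m_cols + chunk, 4), dst += 4;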
+
+// Notes about the speed of this version vs. the baseline (from memory):
+// - With 256K of L1, we can keep a lot of vectors in cache.
+// I recall a reasonable speedup just by rearranging the loop to have
+// row on the outside and batch on the inside.
+// - I also recall getting a nice speedup from sdot.
+// - I tried many times to do better than the current implementation, using
+// loop unrolling and instruction reordering to avoid stalls, etc.
+// but I was not able to do significantly better. This code is, however,
+// much worse than what the processor spec sheet suggests is possible.
+static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *vectors,
+ const float *scaling_factors,
+ int n_batch, float *__restrict__ result)
+{
+ void *shuffled_vectors_free;
+
+ const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
+
+ for (int row = 0; row < m_rows; row += 2)
+ {
+ for (int batch = 0; batch < n_batch; batch += 4)
+ {
+ float *result_ptr = result + (batch * m_rows) + row;
+ const int8 *mat_ptr0 = matrix + (row * m_cols);
+ const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols);
+ const int8 *mat_ptr0_end = mat_ptr1;
+ const int8 *vec_ptr = shuffled_vectors + (batch * m_cols);
+ const float *scaling_factors_ptr = scaling_factors + batch;
+ const uint64_t wide_rows = m_rows * sizeof(float);
+ const int8 *mat_ptr2 = matrix + ((row + 2) * m_cols);
+ const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols);
+
+ asm volatile(
+ // Zero out the accumulator registers.
+ "dup v0.4s, wzr\n"
+ "dup v1.4s, wzr\n"
+ "dup v2.4s, wzr\n"
+ "dup v3.4s, wzr\n"
+
+ "1:\n" // batch_cols_loop
+
+ // Read 16 more bytes from a pair of matrix rows.
+ "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
+
+ // Prefetch two rows ahead.
+ "prfm pldl1strm, [%[mat_ptr2]]\n"
+ "prfm pldl1strm, [%[mat_ptr3]]\n"
+
+ // Read from input vectors 4 times; 64 bytes total.
+ // Each 16-byte register contains parts of 4 vectors; see the
+ // shuffle logic above.
+
+ // From Benoit, places to look in the future:
+ // - Move load instructions further from sdot
+ // - Switch loop use-then-reload
+ // - Do partial unrolling to use register space better
+ "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
+ "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
+ "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
+ "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
+
+ // Update prefetch pointers.
+ "add %[mat_ptr2], %[mat_ptr2], #16\n"
+ "add %[mat_ptr3], %[mat_ptr3], #16\n"
+
+ // Re-use those vectors for the next row as well.
+ "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
+ ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
+ ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
+ ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
+ ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
+
+ // If we're not done with these rows, continue.
+ "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
+ "bne 1b\n" // batch_cols_loop
+
+ // Done with the rows, sum the results.
+ "add v0.4s, v0.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v3.4s\n"
+
+ // Convert the per-vector sums to floating point.
+ "scvtf v0.4s, v0.4s\n"
+ "scvtf v1.4s, v2.4s\n"
+
+ // Fetch scale factors.
+ "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
+
+ // Multiply scale factors times sums.
+ "fmul v0.4s, v4.4s, v0.4s\n"
+ "fmul v1.4s, v4.4s, v1.4s\n"
+
+ // Load previous result values.
+ // The result position is:
+ // result[batch * m_rows + row]
+ // Here that is factored into:
+ // result_ptr = result + row
+ // *result_ptr = res[0]
+ // (uint8*)result_ptr += (m_rows * sizeof(float))
+ // *result_ptr = res[1]
+ // ...
+ // Since we're reading two rows at a time, though, we read both
+ // result[batch * m_rows + row]
+ // and
+ // result[batch * m_rows + row + 1]
+ "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+
+ // Go back to the starting position (subtract wide_rows * 4).
+ "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
+
+ // Add previous result values.
+ "fadd v9.4s, v9.4s, v0.4s\n"
+ "fadd v10.4s, v10.4s, v1.4s\n"
+
+ // Store results.
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
+ [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
+ : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr),
+ [wide_rows] "r"(wide_rows)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "cc", "memory");
+ }
+ }
+
+ free(shuffled_vectors_free);
+}
+
+static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result,
+ const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
+{
+ void *shuffled_vectors_free;
+ const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
+
+ for (int row = 0; row < m_rows; row += 2)
+ {
+ const float *channel_scales_ptr = per_channel_scale + row;
+ int32_t *row_sums_ptr = row_sums ? row_sums + row : nullptr;
+ for (int batch = 0; batch < n_batch; batch += 4)
+ {
+ float *result_ptr = result + (batch * m_rows) + row;
+ const int8 *mat_ptr0 = matrix + (row * m_cols);
+ const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols);
+ const int8 *mat_ptr0_end = mat_ptr1;
+ const int8 *vec_ptr = shuffled_vectors + (batch * m_cols);
+ const float *scaling_factors_ptr = scaling_factors + batch;
+ const uint64_t wide_rows = m_rows * sizeof(float);
+ const int32_t *batch_offsets_ptr = input_offset + batch;
+ const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
+ const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
+ asm volatile("dup v0.4s, wzr\n"
+ "dup v1.4s, wzr\n"
+ "dup v2.4s, wzr\n"
+ "dup v3.4s, wzr\n"
+ // Load zero points.
+ "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
+ "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
+ // Zero out zero point accumulators.
+ "dup v14.4s, wzr\n"
+ "dup v15.4s, wzr\n"
+
+ // Load per channel scales if not null.
+ "cmp %w[is_channel_scale_nullptr], #0\n"
+ "bne 1f\n"
+ "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n"
+ "ld1r {v17.4s}, [%[channel_scales_ptr]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "b 2f\n"
+ "1:\n"
+ "mov v16.16b, v4.16b\n"
+ "mov v17.16b, v4.16b\n"
+ "2:\n"
+ "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
+ "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
+ "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
+ "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
+ "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
+ "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
+ ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
+ ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
+ ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
+ ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
+ "cmp %w[is_row_sums_nullptr], #1\n"
+ "bne 3f\n"
+ // Accumulate row_sums for zero point calculations.
+ "saddlp v12.8h, v12.16b\n"
+ "saddlp v13.8h, v13.16b\n"
+ "sadalp v14.4s, v12.8h\n"
+ "sadalp v15.4s, v13.8h\n"
+ "3:\n"
+ "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
+ "bne 2b\n"
+ "add v0.4s, v0.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v3.4s\n"
+
+ "cmp %w[is_row_sums_nullptr], #1\n"
+ "bne 4f\n"
+ // Calculate zero point offsets.
+ "addv s14, v14.4s\n"
+ "addv s15, v15.4s\n"
+ "dup v14.4s, v14.s[0]\n"
+ "dup v15.4s, v15.s[0]\n"
+ "b 5f\n"
+ "4:\n"
+ "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n"
+ "ld1r {v15.4s}, [%[row_sums_ptr]]\n"
+ "5:\n"
+
+ "mul v14.4s, v14.4s, v7.4s\n"
+ "mul v15.4s, v15.4s, v7.4s\n"
+ "sub v0.4s, v0.4s, v14.4s\n"
+ "sub v2.4s, v2.4s, v15.4s\n"
+
+ "scvtf v0.4s, v0.4s\n"
+ "scvtf v1.4s, v2.4s\n"
+
+ // Multiply scale.
+ "fmul v0.4s, v16.4s, v0.4s\n"
+ "fmul v1.4s, v17.4s, v1.4s\n"
+
+ "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
+ "fadd v9.4s, v9.4s, v0.4s\n"
+ "fadd v10.4s, v10.4s, v1.4s\n"
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
+ [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr)
+ : [mat_ptr0_end] "r"(mat_ptr0_end),
+ [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows),
+ [channel_scales_ptr] "r"(channel_scales_ptr),
+ [batch_offsets_ptr] "r"(batch_offsets_ptr),
+ [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
+ [is_row_sums_nullptr] "r"(is_row_sums_nullptr)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory");
+ }
+ }
+
+ free(shuffled_vectors_free);
+}
+
+// The DotprodMatrixBatchFourVectorMultiplyAccumulate kernel processes 4
+// vectors in the same time as the baseline processes 1 vector. However, it
+// requires 4 vectors of input.
+//
+// To take advantage of this speed difference, we add some zero-valued
+// vectors to the batch so that n_batch is a multiple of 4. Then we execute
+// DotprodMatrixBatchFourVectorMultiplyAccumulate on that padded batch,
+// then extract just the results we want at the end (ignoring the extra padding
+// outputs).
+//
+// The relative cost of the padding is large when the matrix is smaller than
+// 128x128, so we don't use this code path on small matrices. On larger
+// matrices, the computation cost dwarfs the padding cost, making this code
+// viable.
+//
+// If we ignore the cost of padding, this kernel is:
+// 1x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 1
+// 2x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 2
+// 3x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 3
+// ...
+//
+// We don't use this kernel when n_batch = 1 because the baseline kernel
+// is fine for that case.
+inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result,
+ const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
+{
+ const int kWeightsPerUint32 = 4;
+
+  // Round up to the next multiple of 4.
+ int batch_round_up = n_batch;
+ if (n_batch % 4 != 0)
+ {
+ batch_round_up += (4 - n_batch % 4);
+ }
+ assert(n_batch <= batch_round_up);
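+  // e.g. n_batch = 6 -> batch_round_up = 8, while n_batch = 8 is left unchanged.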
+
+ void *padded_vectors_free;
+ const int padded_vectors_size = batch_round_up * m_cols;
+ int8_t *padded_vectors = reinterpret_cast<int8_t *>(
+ aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free));
+ memset(padded_vectors, 0, padded_vectors_size);
+
+ void *padded_result_free;
+ const int result_size = n_batch * m_rows * sizeof(float);
+ const int padded_result_size = batch_round_up * m_rows * sizeof(float);
+ float *padded_result = reinterpret_cast<float *>(
+ aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free));
+ memcpy(padded_result, result, result_size);
+ memset(reinterpret_cast<char *>(padded_result) + result_size, 0,
+ padded_result_size - result_size);
+
+ // Copy the input into the padded data structure.
+ assert(n_batch * m_cols <= padded_vectors_size);
+ memcpy(padded_vectors, vectors, n_batch * m_cols);
+
+ void *padded_scaling_factors_free;
+ const int padded_scaling_factors_size = batch_round_up * sizeof(float);
+ float *padded_scaling_factors = reinterpret_cast<float *>(
+ aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free));
+ assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size);
+ assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size);
+ memset(padded_scaling_factors, 0, batch_round_up * sizeof(float));
+ memcpy(padded_scaling_factors, scaling_factors, n_batch * sizeof(float));
+
+ if (input_offset != nullptr)
+ {
+ void *padded_input_offset_free;
+ const int padded_input_offset_size = batch_round_up * sizeof(int32_t);
+ int32_t *padded_input_offset = reinterpret_cast<int32_t *>(
+ aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free));
+ assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size);
+ assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size);
+ memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t));
+ memcpy(padded_input_offset, input_offset, n_batch * sizeof(int32_t));
+
+ // Call the main kernel.
+ DotprodMatrixBatchFourVectorMultiplyAccumulate(
+ matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up,
+ padded_result, per_channel_scale, padded_input_offset, row_sums);
+
+ free(padded_input_offset_free);
+ }
+ else
+ {
+ // Call the main kernel.
+ DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, padded_vectors,
+ padded_scaling_factors, batch_round_up,
+ padded_result);
+ }
+ memcpy(result, padded_result, result_size);
+
+ free(padded_result_free);
+ free(padded_vectors_free);
+ free(padded_scaling_factors_free);
+}
+
+inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result)
+{
+ DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
+ matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+ /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
+ /*row_sums=*/nullptr);
+}
+#endif // __aarch64__
+
+inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value)
+{
+ const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value);
+ const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value);
+
+ int i = 0;
+ for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector)
+ {
+ // Load from memory to vector.
+ float32x4_t v_f32x4 = vld1q_f32(vector + i);
+ // Clip between clipping_value and -clipping_value.
+ v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4);
+ v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4);
+ // Save to output.
+ vst1q_f32(vector + i, v_f32x4);
+ }
+ for (; i < v_size; i++)
+ {
+ vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value);
+ }
+}
+
+inline bool NeonIsZeroVector(const float *vector, int v_size)
+{
+ // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
+ // use the main vectorized loop, and we need to process sequentially.
+ // postamble_start shows the start index where this should happen.
+ const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+ const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
+ {
+ const float32x4_t i_x4_float = vld1q_f32(vector + v);
+ uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
+ if (vgetq_lane_u32(cmp_result, 0) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 1) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 2) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 3) == 0)
+ return false;
+ }
+
+ // Postamble loop
+ for (int v = postamble_start; v < v_size; ++v)
+ {
+ if (vector[v] != 0.0)
+ return false;
+ }
+ return true;
+}
+
+inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
+ const int8_t *input_to_gate_weights, int32_t n_batch,
+ int32_t n_input, int32_t n_output, int32_t, int32_t *scratch,
+ ruy::Context *ruy_context)
+{
+ MatrixParams<int8_t> lhs_params;
+ lhs_params.order = Order::kRowMajor;
+ lhs_params.rows = n_output;
+ lhs_params.cols = n_input;
+ lhs_params.cache_policy = CachePolicy::kAlwaysCache;
+
+ MatrixParams<int8_t> rhs_params;
+ rhs_params.order = Order::kColMajor;
+ rhs_params.rows = n_input;
+ rhs_params.cols = n_batch;
+
+ MatrixParams<int32_t> dst_params;
+ dst_params.order = Order::kColMajor;
+ dst_params.rows = n_output;
+ dst_params.cols = n_batch;
+
+ GemmParams<int32_t, int32_t> gemm_params;
+ if (bias)
+ {
+ gemm_params.bias = bias;
+ }
+
+  // The code below is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy
+ ruy::Matrix<int8_t> ruy_lhs;
+ ruy::Matrix<int8_t> ruy_rhs;
+ ruy::Matrix<int32_t> ruy_dst;
+ // Note that cache is always enabled for input and weight tensors
+ ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true);
+ ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true);
+ ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);
+
+ ruy::BasicSpec<int32_t, int32_t> ruy_mul_params;
+ ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);
+
+ ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
+}
+
+inline void NeonSub1Vector(const float *vector, int v_size, float *result)
+{
+ // If v_size is not divisible by the vector size, then we need to process the
+ // final few elements sequentially. postamble_start shows the start index
+ // where this should happen.
+ const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
+
+ float32x4_t one_f32x4 = vmovq_n_f32(1.0);
+ int v = 0;
+ for (; v < postamble_start; v += kFloatValuesPerNeonVector)
+ {
+ // Load 4 float values from the current pointers of the input column and
+ // subtract from 1.
+ float32x4_t v_f32x4 = vld1q_f32(vector + v);
+ float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
+ // Save to output.
+ vst1q_f32(result + v, result_f32x4);
+ }
+ for (; v < v_size; v++)
+ {
+ result[v] = 1.0f - vector[v];
+ }
+}
+
+inline void NeonSymmetricQuantizeFloats(const float *values, const int size,
+ int8_t *quantized_values, float *min, float *max,
+ float *scaling_factor)
+{
+ // TODO(raziel): vectorize min/max calculation.
+ auto minmax = std::minmax_element(values, values + size);
+ *min = *minmax.first;
+ *max = *minmax.second;
+ const int kScale = 127;
+ const float range = std::max(std::abs(*min), std::abs(*max));
+ if (range == 0)
+ {
+ memset(quantized_values, 0, size * sizeof(int8_t));
+ *scaling_factor = 1;
+ return;
+ }
+ *scaling_factor = range / kScale;
+ const float scaling_factor_inv = kScale / range;
+
+ const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1));
+
+ // Vectorized constants.
+ const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
+ const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
+ const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
+ const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
+ const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
+
+ for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane)
+ {
+    // Implements the vectorized version of the following:
+    //   const int32_t quantized_value = static_cast<int32_t>(
+    //       std::round(scaling_factor_inv * values[i]));
+    // Since the vectorized round intrinsic (vrndaq_f32) is not supported
+    // on all Neon flavors, we use the following method for rounding:
+    //   if (x < 0) (int)(x - 0.5); if (x >= 0) (int)(x + 0.5)
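+    // For example (illustrative values): x = 2.7 -> (int)(2.7 + 0.5) = 3, and
+    // x = -2.7 -> (int)(-2.7 - 0.5) = -3.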
+ float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
+ float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
+ float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
+ float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
+
+ int32x4_t cmp_with_zero0_ui32x4 = (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4); // NOLINT
+ int32x4_t cmp_with_zero1_ui32x4 = (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4); // NOLINT
+
+ float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
+ float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
+ cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
+ cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
+
+ mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
+ mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
+
+ int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
+ int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
+
+    // Implements the vectorized version of the following block:
+ // quantized_values[i] = std::min(kScale, std::max(-kScale,
+ // quantized_value));
+ int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
+ int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
+ int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
+ int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
+
+ int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
+ int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
+
+ int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
+ int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
+ vst1_s8(&quantized_values[i], min_s8x8);
+ }
+
+ for (int i = postamble_start; i < size; ++i)
+ {
+ const int32_t quantized_value =
+ static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+ quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+ }
+}
+
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result, int result_stride)
+{
+#ifdef __aarch64__
+ if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch)
+ {
+ if (n_batch % 4 == 0 && result_stride == 1)
+ {
+ // Benchmarks suggest that it's always better to use the batch code
+ // when we can, even on small matrices.
+ DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
+ scaling_factors, n_batch, result);
+ return;
+ }
+ else if (result_stride == 1 && n_batch >= 2 && m_rows * m_cols >= 128 * 128)
+ {
+ DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
+ scaling_factors, n_batch, result);
+ return;
+ }
+ }
+#endif // __aarch64__
+
+ static const int kWeightsPerUint32 = 4;
+ static const int kWeightsPerNeonLane = 16;
+  // Assuming *matrix is kWeightsPerUint32-byte aligned, every row of the
+  // matrix is also kWeightsPerUint32-byte aligned as long as cols is a
+  // multiple of kWeightsPerUint32. The assumption is currently satisfied by
+  // TFLite's 16-byte memory alignment scheme.
+  //
+  // Otherwise, we allocate an aligned memory block and set a flag to later
+  // copy rows from matrix to the block for aligned multiplication.
+ bool unaligned = false;
+ int8_t *aligned_row = nullptr;
+ void *aligned_row_free = nullptr;
+ if ((m_cols & (kWeightsPerUint32 - 1)) != 0)
+ {
+ unaligned = true;
+ aligned_row = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+ &aligned_row_free);
+ }
+ void *aligned_vec_free = nullptr;
+ int8_t *aligned_vec = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+ &aligned_vec_free);
+
+  // If m_cols is not a multiple of kWeightsPerNeonLane, the main vectorized
+  // loop cannot cover the trailing columns, and they are processed
+  // sequentially. postamble_half_start is the column index where the main loop
+  // stops. Between postamble_half_start and postamble_start we can still
+  // process kWeightsPerNeonLane >> 1 elements at a time in vectorized form.
+ const int postamble_half_start = m_cols & ~(kWeightsPerNeonLane - 1);
+ const int postamble_start = m_cols & ~((kWeightsPerNeonLane >> 1) - 1);
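+  // For example (illustrative): m_cols = 29 gives postamble_half_start = 16 and
+  // postamble_start = 24, so columns [0, 16) use the 16-wide loop, [16, 24) the
+  // single 8-wide half iteration, and [24, 29) the scalar postamble.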
+
+ for (int batch = 0; batch < n_batch; ++batch)
+ {
+ const float batch_scaling_factor = scaling_factors[batch];
+ // Copy the vector data to an aligned vector.
+ memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
+ // Compute dot-product for every column.
+ for (int row = 0; row < m_rows; ++row, result += result_stride)
+ {
+ // Get the address of the first element of the row.
+ int8_t *row_ptr = (int8_t *)matrix + row * m_cols; // NOLINT
+ if (unaligned)
+ {
+ memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
+ row_ptr = aligned_row;
+ }
+
+ // Initialize the dot product sum for the row to 0.
+ int32x4_t dotprod_32x4 = vmovq_n_s32(0);
+
+ // Prefetch the row to cache.
+ __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
+
+ // For every block of 16 8-bit elements.
+ int col = 0;
+ for (; col < postamble_half_start; col += kWeightsPerNeonLane)
+ {
+ // Load 16 8-bit values from the row and vector, each, to operate on.
+ // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
+ // performance may suffer significantly.
+ assert( // NOLINT
+ ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col));
+ const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col));
+        // Multiply the low bits (i.e. the lower 8 8-bit numbers in the
+        // registers).
+        int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+        // Multiply the high bits (i.e. the higher 8 8-bit numbers in the
+        // registers), and accumulate with the result of the low bits product.
+ // The assumption here is that overflow will not happen as we quantize
+ // our values to be in the range [-127, 127]. As such the sum of the 2
+ // products is always strictly smaller than 15-bits (32767 in absolute
+ // value).
+ prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+ dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
+ } // for col
+
+      // Half iteration dealing with only 8 elements
+ // TODO(raziel): if (ABSL_PREDICT_FALSE(col < postamble_start))
+ if (col < postamble_start)
+ {
+ // Load 8 8-bit values from the row and column each to operate on.
+        // Here the assumption is that each buffer is 4-byte aligned.
+ // Otherwise, performance may suffer significantly.
+ assert( // NOLINT
+ ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
+ const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
+ const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
+ dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
+ col += (kWeightsPerNeonLane >> 1);
+ }
+ // Add the 4 intermediate sum values to get the final dot-prod value for
+ // this row.
+ int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
+ // Postamble loop.
+ // TODO(raziel): if (ABSL_PREDICT_FALSE(col < m_cols))
+ for (; col < m_cols; ++col)
+ {
+ dotprod += row_ptr[col] * aligned_vec[col];
+ } // for col
+
+ *result += dotprod * batch_scaling_factor;
+ } // for row
+ } // for batch
+
+ if (unaligned)
+ {
+ free(aligned_row_free);
+ }
+ free(aligned_vec_free);
+}
+
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
+{
+  // If m_cols is not divisible by kFloatWeightsPerNeonLane, the main vectorized
+  // loop cannot cover the trailing columns, and they are processed sequentially.
+  // postamble_start is the index where this happens.
+ const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+ for (int b = 0; b < n_batch; b++)
+ {
+ float *result_in_batch = result + b * m_rows * result_stride;
+ const float *vector_in_batch = vector + b * m_cols;
+ const float *matrix_row = matrix;
+
+ // Main matrix by vector multiplication loop
+ for (int r = 0; r < m_rows; r++)
+ {
+ float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane)
+ {
+ // Load 4 float values from vector and matrix row.
+ float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
+ float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
+ // Multiply the vector and matrix row and add to accumulator.
+ acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+ }
+      // Add the 4 intermediate sum values to get the final dot-prod value for
+      // this row.
+ *result_in_batch += (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+ vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+ for (int c = postamble_start; c < m_cols; c++)
+ {
+ *result_in_batch += matrix_row[c] * vector_in_batch[c];
+ }
+ matrix_row += m_cols;
+ result_in_batch += result_stride;
+ }
+ }
+}
+
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ int32_t *scratch, float *__restrict__ result,
+ int result_stride, ruy::Context *ruy_context)
+{
+ if (m_rows % 4 == 0 && result_stride == 1)
+ {
+ const int32_t *bias = static_cast<const int32_t *>(nullptr);
+ NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
+ /*output_zp =*/0, scratch, ruy_context);
+
+ // Multiply by float scaling factors and write to result
+ const int total_size = n_batch * m_rows;
+ int i = 0;
+ for (; i <= total_size - 8; i += 8, result += 8 * result_stride)
+ {
+ const float batch_scaling_factor0 = scaling_factors[i / m_rows];
+ const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows];
+ const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0);
+ const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1);
+ const int32x4_t scratch_val0 = vld1q_s32(scratch + i);
+ const int32x4_t scratch_val1 = vld1q_s32(scratch + i + 4);
+ const float32x4_t float_val0 = vcvtq_f32_s32(scratch_val0);
+ const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1);
+ const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0);
+ const float32x4_t result1 =
+ vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1);
+ vst1q_f32(result, result0);
+ vst1q_f32(result + 4 * result_stride, result1);
+ }
+ scratch += i;
+ for (; i < total_size; i++, result += result_stride)
+ {
+ const float batch_scaling_factor = scaling_factors[i / m_rows];
+ int32_t x = *(scratch++);
+ *result += x * batch_scaling_factor;
+ }
+ return;
+ }
+ NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, scaling_factors, n_batch,
+ result, result_stride);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // USE_NEON
+
+#endif // __NNFW_CKER_NEON_TENSOR_UTILS_H__
diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h
new file mode 100644
index 000000000..3b3b27f72
--- /dev/null
+++ b/compute/cker/include/cker/PortableTensorUtils.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__
+#define __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__
+
+#include "cker/Types.h"
+#include "cker/neon/neon_check.h"
+#include <ruy/context.h>
+
+#include <cstring>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+class ActivationFunctor
+{
+public:
+ explicit ActivationFunctor(FusedActivationFunctionType act) : act_(act) {}
+
+ float operator()(float a) const
+ {
+ switch (act_)
+ {
+ case FusedActivationFunctionType::kNone:
+ return a;
+ case FusedActivationFunctionType::kRelu:
+ return a < 0.f ? 0.f : a;
+ case FusedActivationFunctionType::kRelu6:
+ return std::max(0.f, std::min(a, 6.f));
+ case FusedActivationFunctionType::kTanh:
+ return std::tanh(a);
+ case FusedActivationFunctionType::kSigmoid:
+ return 1.0f / (1.0f + std::exp(-a));
+ default:
+ // TODO(aselle): More informative fatal error!
+ exit(1);
+ }
+ }
+
+private:
+ FusedActivationFunctionType act_;
+};
+
+template <typename T>
+void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value)
+{
+ for (int i = 0; i < v_size; i++)
+ {
+ vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value));
+ }
+}
+
+inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
+ }
+}
+
+inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ for (int i = 0; i < v_size; ++i)
+ {
+ batch_vector[i] += vector[i];
+ }
+ batch_vector += v_size;
+ }
+}
+
+inline bool PortableIsZeroVector(const float *vector, int v_size)
+{
+ for (int i = 0; i < v_size; ++i)
+ {
+ if (*vector++ != 0.0f)
+ return false;
+ }
+ return true;
+}
+
+inline void PortableApplyActivationToVector(const float *vector, int v_size,
+ FusedActivationFunctionType activation, float *result)
+{
+ auto activation_func = ActivationFunctor(activation);
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ = (activation_func)(*vector++);
+ }
+}
+
+inline void PortableSub1Vector(const float *vector, int v_size, float *result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ = 1.0f - *vector++;
+ }
+}
+
+inline void PortableSymmetricQuantizeFloats(const float *values, const int size,
+ int8_t *quantized_values, float *min_value,
+ float *max_value, float *scaling_factor)
+{
+ auto minmax = std::minmax_element(values, values + size);
+ *min_value = *minmax.first;
+ *max_value = *minmax.second;
+ const int kScale = 127;
+ const float range = std::max(std::abs(*min_value), std::abs(*max_value));
+ if (range == 0)
+ {
+ memset(quantized_values, 0, size * sizeof(int8_t));
+ *scaling_factor = 1;
+ return;
+ }
+ *scaling_factor = range / kScale;
+ const float scaling_factor_inv = kScale / range;
+ for (int i = 0; i < size; ++i)
+ {
+ const int32_t quantized_value =
+ static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
+    // Clamp, just in case of any odd numeric offset.
+ quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+ }
+}
+
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result,
+ int result_stride)
+{
+ int batch, row, col;
+ for (batch = 0; batch < n_batch; ++batch, vectors += m_cols)
+ {
+ const float batch_scaling_factor = scaling_factors[batch];
+ // Get the address of the first row.
+ const int8_t *row_ptr = matrix;
+ for (row = 0; row < m_rows; ++row, result += result_stride)
+ {
+ // Initialize the dot product sum for the row to 0.
+ int32_t dotprod = 0;
+#if defined(__GNUC__)
+ // Prefetch the row to cache.
+ __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
+#endif
+ for (col = 0; col < m_cols; ++col, ++row_ptr)
+ {
+ dotprod += (*row_ptr) * (vectors[col]);
+ } // for col
+ *result += (dotprod * batch_scaling_factor);
+ } // for row
+ } // for batch
+}
+
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vector,
+ const float *scaling_factors, int n_batch,
+ int32_t *, float *__restrict__ result,
+ int result_stride, ruy::Context *)
+{
+ PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
+ n_batch, result, result_stride);
+}
+
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch,
+ float *result, int result_stride)
+{
+ float *result_in_batch = result;
+ for (int b = 0; b < n_batch; b++)
+ {
+ const float *matrix_ptr = matrix;
+ for (int r = 0; r < m_rows; r++)
+ {
+ float dot_prod = 0.0f;
+ const float *vector_in_batch = vector + b * m_cols;
+ for (int c = 0; c < m_cols; c++)
+ {
+ dot_prod += *matrix_ptr++ * *vector_in_batch++;
+ }
+ *result_in_batch += dot_prod;
+ result_in_batch += result_stride;
+ }
+ }
+}
+
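+// Normalizes each batch slice of v_size elements to zero mean and unit
+// variance: output[i] = (input[i] - mean) / sqrt(variance + 1e-8), with mean
+// and variance computed per batch.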
+inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector,
+ int v_size, int n_batch)
+{
+ for (int batch = 0; batch < n_batch; ++batch)
+ {
+ float sum = 0.0f;
+ for (int i = 0; i < v_size; ++i)
+ {
+ sum += input_vector[i];
+ }
+ const float mean = sum / v_size;
+ float sum_diff_sq = 0.0f;
+ for (int i = 0; i < v_size; ++i)
+ {
+ const float diff = input_vector[i] - mean;
+ sum_diff_sq += diff * diff;
+ }
+ const float variance = sum_diff_sq / v_size;
+ constexpr float kNormalizationConstant = 1e-8f;
+ const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant);
+ for (int i = 0; i < v_size; ++i)
+ {
+ output_vector[i] = (input_vector[i] - mean) * stddev_inv;
+ }
+ input_vector += v_size;
+ output_vector += v_size;
+ }
+}
+
+inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); }
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__
diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h
new file mode 100644
index 000000000..86caf7d18
--- /dev/null
+++ b/compute/cker/include/cker/Shape.h
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SHAPE_H__
+#define __NNFW_CKER_SHAPE_H__
+
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+#include <vector>
+
+#define UNUSED_RELEASE(a) (void)(a)
+
+namespace nnfw
+{
+namespace cker
+{
+
+class Shape
+{
+public:
+ // Shapes with dimensions up to 5 are stored directly in the structure, while
+ // larger shapes are separately allocated.
+ static constexpr int kMaxSmallSize = 5;
+
+ Shape &operator=(Shape const &) = delete;
+
+ Shape() : _size(0) {}
+
+ explicit Shape(int dimensions_count) : _size(dimensions_count)
+ {
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ Shape(int shape_size, int32_t value) : _size(0)
+ {
+ Resize(shape_size);
+ for (int i = 0; i < shape_size; ++i)
+ {
+ SetDim(i, value);
+ }
+ }
+
+ Shape(int dimensions_count, const int32_t *dims_data) : _size(0)
+ {
+ ReplaceWith(dimensions_count, dims_data);
+ }
+
+ Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); }
+
+ // Avoid using this constructor. We should be able to delete it when C++17
+ // rolls out.
+ Shape(Shape const &other) : _size(other.DimensionsCount())
+ {
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[_size];
+ }
+ std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size);
+ }
+
+ bool operator==(const Shape &comp) const
+ {
+ return this->_size == comp._size &&
+ std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0;
+ }
+
+ ~Shape()
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ }
+
+ inline int32_t DimensionsCount() const { return _size; }
+ inline int32_t Dims(int i) const
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i];
+ }
+ inline void SetDim(int i, int32_t val)
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer[i] = val;
+ }
+ else
+ {
+ _dims[i] = val;
+ }
+ }
+
+ inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ // The caller must ensure that the shape is no bigger than 4-D.
+ inline const int32_t *DimsDataUpTo4D() const { return _dims; }
+
+ inline void Resize(int dimensions_count)
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ _size = dimensions_count;
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ inline void ReplaceWith(int dimensions_count, const int32_t *dims_data)
+ {
+ Resize(dimensions_count);
+ int32_t *dst_dims = DimsData();
+ std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
+ }
+
+ inline void ReplaceWith(const Shape &other)
+ {
+ ReplaceWith(other.DimensionsCount(), other.DimsData());
+ }
+
+ inline void ReplaceWith(Shape &&other)
+ {
+ Resize(0);
+ std::swap(_size, other._size);
+ if (_size <= kMaxSmallSize)
+ std::copy(other._dims, other._dims + kMaxSmallSize, _dims);
+ else
+ _dims_pointer = other._dims_pointer;
+ }
+
+ template <typename T> inline void BuildFrom(const T &src_iterable)
+ {
+ const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end());
+ Resize(dimensions_count);
+ int32_t *data = DimsData();
+ for (auto it : src_iterable)
+ {
+ *data = it;
+ ++data;
+ }
+ }
+
+ // This will probably be factored out. Old code made substantial use of 4-D
+ // shapes, and so this function is used to extend smaller shapes. Note that
+ // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
+  // reduced, and (b) some kernels are strictly 4-D, but then the shapes of their
+ // inputs should already be 4-D, so this function should not be needed.
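+  // For example (illustrative): ExtendedShape(4, Shape({3, 4})) produces the
+  // shape {1, 1, 3, 4}, padding the leading dimensions with 1.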
+ inline static Shape ExtendedShape(int new_shape_size, const Shape &shape)
+ {
+ return Shape(new_shape_size, shape, 1);
+ }
+
+ inline void BuildFrom(const std::initializer_list<int> init_list)
+ {
+ BuildFrom<const std::initializer_list<int>>(init_list);
+ }
+
+ // Returns the total count of elements, that is the size when flattened into a
+ // vector.
+ inline int FlatSize() const
+ {
+ int buffer_size = 1;
+ const int *dims_data = DimsData();
+ for (int i = 0; i < _size; i++)
+ {
+ const int dim = dims_data[i];
+ buffer_size *= dim;
+ }
+ return buffer_size;
+ }
+
+ bool operator!=(const Shape &comp) const { return !((*this) == comp); }
+
+private:
+ // For use only by ExtendedShape(), written to guarantee (return-value) copy
+ // elision in C++17.
+ // This creates a shape padded to the desired size with the specified value.
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0)
+ {
+ assert(new_shape_size >= shape.DimensionsCount());
+ assert(new_shape_size <= kMaxSmallSize);
+ Resize(new_shape_size);
+ const int size_increase = new_shape_size - shape.DimensionsCount();
+ for (int i = 0; i < size_increase; ++i)
+ {
+ SetDim(i, pad_value);
+ }
+ std::memcpy(DimsData() + size_increase, shape.DimsData(),
+ sizeof(int32_t) * shape.DimensionsCount());
+ }
+
+ int32_t _size;
+ union {
+ int32_t _dims[kMaxSmallSize];
+ int32_t *_dims_pointer{nullptr};
+ };
+};
+
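+// Usage sketch (illustrative, with made-up dimensions): shapes of up to
+// kMaxSmallSize dimensions are stored inline, e.g.
+//   Shape s{1, 8, 8, 3};  // 4-D shape kept in the inline _dims array
+//   s.Dims(3);            // == 3
+//   s.FlatSize();         // == 1 * 8 * 8 * 3 == 192
+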
+inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
+{
+ UNUSED_RELEASE(shape2);
+ UNUSED_RELEASE(index2);
+ assert(shape1.Dims(index1) == shape2.Dims(index2));
+ return shape1.Dims(index1);
+}
+
+template <typename... Args>
+int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2, Args... args)
+{
+ assert(shape1.Dims(index1) == shape2.Dims(index2));
+ UNUSED_RELEASE(shape2);
+ UNUSED_RELEASE(index2);
+ return MatchingDim(shape1, index1, args...);
+}
+
+inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); }
+
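+// For example (illustrative): for a shape of {2, 3, 4, 5},
+// Offset(shape, 1, 2, 3, 4) == ((1 * 3 + 2) * 4 + 3) * 5 + 4 == 119.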
+inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3)
+{
+ assert(shape.DimensionsCount() == 4);
+ const int *dims_data = shape.DimsDataUpTo4D();
+ assert(i0 >= 0 && i0 < dims_data[0]);
+ assert(i1 >= 0 && i1 < dims_data[1]);
+ assert(i2 >= 0 && i2 < dims_data[2]);
+ assert(i3 >= 0 && i3 < dims_data[3]);
+ return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+inline int Offset(const Shape &shape, int *index)
+{
+ return Offset(shape, index[0], index[1], index[2], index[3]);
+}
+
+inline int FlatSizeSkipDim(const Shape &shape, int skip_dim)
+{
+ const int dims_count = shape.DimensionsCount();
+ assert(skip_dim >= 0 && skip_dim < dims_count);
+ const auto *dims_data = shape.DimsData();
+ int flat_size = 1;
+ for (int i = 0; i < dims_count; ++i)
+ {
+ flat_size *= (i == skip_dim) ? 1 : dims_data[i];
+ }
+ return flat_size;
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <typename... Ts> inline bool checkMatching(const Shape &shape, Ts... check_shapes)
+{
+ const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...};
+ for (const auto &check_shape : check_shapes_array)
+ {
+    // Check that the shapes match, except when both shapes are scalars (which trivially match)
+ if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 ||
+ check_shape.FlatSize() != 1)
+ {
+ if (shape.DimensionsCount() != check_shape.DimensionsCount())
+ {
+ return false;
+ }
+ for (int i = 0; i < shape.DimensionsCount(); ++i)
+ {
+ if (shape.Dims(i) != check_shape.Dims(i))
+ {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+struct UNUSED_ALL
+{
+ template <typename... Args> UNUSED_ALL(Args const &...) {}
+};
+template <typename... Ts> inline int MatchingFlatSize(const Shape &shape, Ts... check_shapes)
+{
+ UNUSED_ALL{check_shapes...};
+ assert(checkMatching(shape, std::forward<Ts>(check_shapes)...));
+ return shape.FlatSize();
+}
+
+inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0)
+{
+ UNUSED_RELEASE(check_shape_0);
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ if (i != skip_dim)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ }
+ return FlatSizeSkipDim(shape, skip_dim);
+}
+
+inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0,
+ const Shape &check_shape_1)
+{
+ UNUSED_RELEASE(check_shape_0);
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ if (i != skip_dim)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ }
+ return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1);
+}
+
+inline int MatchingElementsSize(const Shape &shape, const Shape &check_shape_0,
+ const Shape &check_shape_1)
+{
+ const int size_1 = shape.FlatSize();
+ const int size_2 = check_shape_0.FlatSize();
+ const int size_3 = check_shape_1.FlatSize();
+ assert(size_1 == size_2);
+ assert(size_2 == size_3);
+ UNUSED_RELEASE(size_2);
+ UNUSED_RELEASE(size_3);
+ return size_1;
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SHAPE_H__
diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h
new file mode 100644
index 000000000..bac79b887
--- /dev/null
+++ b/compute/cker/include/cker/TensorUtils.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TENSOR_UTILS_H__
+#define __NNFW_CKER_TENSOR_UTILS_H__
+
+#include "cker/Types.h"
+#include "cker/PortableTensorUtils.h"
+#include "cker/NeonTensorUtils.h"
+#include "cker/neon/neon_check.h"
+
+#include <cstring>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void CwiseClipping(float *vector, const int v_size, const float clipping_value)
+{
+ NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
+}
+
+inline void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)
+{
+ PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector);
+}
+
+inline void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
+{
+ PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
+}
+
+// Cwise product of two vectors.
+template <typename T>
+inline void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2,
+ int v_size, T *__restrict__ result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ = *vector1++ * *vector2++;
+ }
+}
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+template <typename T>
+inline void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1,
+ const T *__restrict__ vector2, int v_size,
+ T *__restrict__ result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ += *vector1++ * *vector2++;
+ }
+}
+
+// Cwise product of a vector and a batch-vector.
+template <typename T>
+inline void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector,
+ int n_batch, T *result)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
+ // Update the pointers.
+ result += v_size;
+ batch_vector += v_size;
+ }
+}
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+template <typename T>
+inline void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size,
+ const T *batch_vector, int n_batch, T *result)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
+ // Update the pointers.
+ result += v_size;
+ batch_vector += v_size;
+ }
+}
+
+inline bool IsZeroVector(const float *vector, int v_size)
+{
+ return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
+}
+
+inline void ApplyActivationToVector(const float *vector, int v_size,
+ FusedActivationFunctionType activation, float *result)
+{
+ PortableApplyActivationToVector(vector, v_size, activation, result);
+}
+
+inline void Sub1Vector(const float *vector, int v_size, float *result)
+{
+ NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
+}
+
+inline void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
+ float *min, float *max, float *scaling_factor)
+{
+ return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max,
+ scaling_factor);
+}
+
+inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows,
+ const int m_cols, const int8_t *vector,
+ const float *scaling_factors, int n_batch,
+ float *result, int result_stride)
+{
+ NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector,
+ scaling_factors, n_batch, result, result_stride);
+}
+
+inline void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
+{
+ NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch,
+ result, result_stride);
+}
+
+inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows,
+ const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch,
+ int32_t *scratch, float *result, int result_stride,
+ ruy::Context *ruy_context)
+{
+ NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors,
+ scaling_factors, n_batch, scratch, result, result_stride, ruy_context);
+}
+
+inline void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size,
+ int n_batch)
+{
+ PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
+}
+
+inline void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); }
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TENSOR_UTILS_H__
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
new file mode 100644
index 000000000..acb6cac55
--- /dev/null
+++ b/compute/cker/include/cker/Types.h
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TYPES_H__
+#define __NNFW_CKER_TYPES_H__
+
+#include <cstdint>
+#include <type_traits>
+#include <limits>
+#include <string>
+
+namespace nnfw
+{
+namespace cker
+{
+
+enum class FusedActivationFunctionType
+{
+ kNone = 0,
+ kRelu6 = 1,
+ kRelu1 = 2,
+ kRelu = 3,
+ kTanh = 4,
+ kSigmoid = 6,
+};
+enum class PaddingType
+{
+ kNone = 0,
+ kSame = 1,
+ kValid = 2,
+};
+
+enum class BinaryArithmeticOpType
+{
+ ADD = 0,
+ SUB = 1,
+ MUL = 2,
+ DIV = 3,
+ POW = 4,
+};
+
+enum class ComparisonOpType
+{
+ Equal,
+ NotEqual,
+ Greater,
+ GreaterEqual,
+ Less,
+ LessEqual
+};
+
+struct PaddingValues
+{
+ int16_t width;
+ int16_t height;
+};
+
+enum class BroadcastableOpCategory : uint8_t
+{
+ kNone,
+ kNonBroadcast, // Matching input shapes.
+ kFirstInputBroadcastsFast, // Fivefold nested loops.
+ kSecondInputBroadcastsFast, // Fivefold nested loops.
+ kGenericBroadcast, // Fall-back.
+};
+
+struct PoolParams
+{
+ FusedActivationFunctionType activation;
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int stride_height;
+ int stride_width;
+ int filter_height;
+ int filter_width;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct SoftmaxParams
+{
+ // beta is not really used (not a Tensorflow parameter) and not implemented
+ // for LogSoftmax.
+ double beta;
+ int axis;
+ // uint8 inference params. Used even when beta defaults to 1.0.
+ int32_t input_multiplier;
+ int32_t input_left_shift;
+ // Reverse scaling is only used by LogSoftmax.
+ int32_t reverse_scaling_divisor;
+ int32_t reverse_scaling_right_shift;
+ int diff_min;
+ int32_t zero_point;
+ float scale;
+ float *table;
+};
+
+struct PackParams
+{
+ int8_t axis;
+  // zeropoint and scale were only used to implement PackWithScaling in the
+  // legacy code of TensorFlow.
+ // const int32_t* input_zeropoint;
+ // const float* input_scale;
+ uint16_t inputs_count;
+ // int32_t output_zeropoint;
+ // float output_scale;
+};
+
+struct UnpackParams
+{
+ uint16_t num_split;
+ int16_t axis;
+};
+
+struct ConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+ bool is_replaced_weights{false};
+};
+
+struct ComparisonParams
+{
+ ComparisonOpType type;
+ int left_shift;
+ int input1_shift;
+ int input2_shift;
+ int32_t input1_offset;
+ int32_t input1_multiplier;
+ int32_t input2_offset;
+ int32_t input2_multiplier;
+ bool is_broadcast;
+};
+
+struct BinaryArithmeticOpParam
+{
+ // Shape dependent / common to data / op types.
+ BroadcastableOpCategory broadcast_category{BroadcastableOpCategory::kNone};
+ // uint8 inference params.
+ int32_t input1_offset = 0;
+ int32_t input2_offset = 0;
+ int32_t output_offset = 0;
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
+ // Add / Sub, not Mul, uint8 inference params.
+ int32_t left_shift = 0;
+ int32_t input1_multiplier = 0;
+ int32_t input1_shift = 0;
+ int32_t input2_multiplier = 0;
+ int32_t input2_shift = 0;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min = 0;
+ int32_t quantized_activation_max = 0;
+ // float activation params.
+ float float_activation_min = 0;
+ float float_activation_max = 0;
+
+ // Processed output dimensions.
+ // Let input "a" be the one that broadcasts in the faster-changing dimension.
+ // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
+ // {b0, b1, b2, b3, b4},
+ // broadcast_shape[4] = b0 = a0.
+ // broadcast_shape[3] = b1; a1 = 1.
+ // broadcast_shape[2] = b2 = a2.
+ // broadcast_shape[1] = a3; b3 = 1.
+ // broadcast_shape[0] = b4 = a4.
+ int broadcast_shape[5] = {};
+};
+
+struct TransposeParams
+{
+ int8_t perm_count;
+ int32_t perm[4];
+};
+
+struct ConcatenationParams
+{
+ int8_t axis;
+ const int32_t *input_zeropoint;
+ const float *input_scale;
+ uint16_t inputs_count;
+ int32_t output_zeropoint;
+ float output_scale;
+};
+
+struct DepthwiseConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ int16_t depth_multiplier;
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct FullyConnectedParams
+{
+ FusedActivationFunctionType activation{FusedActivationFunctionType::kNone};
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ float weights_scale;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+  // float activation params - no one uses these params, but ruy might use them later.
+ // float float_activation_min;
+ // float float_activation_max;
+ // FullyConnectedWeightsFormat weights_format;
+};
+
+struct L2NormParams
+{
+ // uint8 inference params.
+ int32_t input_zero_point;
+};
+
+enum LSTMKernelType
+{
+ kTfLiteLSTMFullKernel = 0,
+ kTfLiteLSTMBasicKernel
+};
+
+struct LSTMParams
+{
+ // Parameters for LSTM version 1.
+ FusedActivationFunctionType activation{FusedActivationFunctionType::kNone};
+ float cell_clip;
+ float proj_clip;
+
+ // Parameters for LSTM version 2.
+ // kTfLiteLSTMBasicKernel is only supported in version 2 or above.
+ LSTMKernelType kernel_type;
+
+ // Parameters for LSTM version 4.
+ bool asymmetric_quantize_inputs;
+};
+
+struct GatherParams
+{
+ int32_t axis;
+};
+
+struct InstanceNormParams
+{
+ float epsilon;
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct ResizeBilinearParams
+{
+ int32_t output_height;
+ int32_t output_width;
+ bool align_corners;
+ bool half_pixel_centers;
+};
+
+struct TransposeConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct SliceParams
+{
+ int8_t begin_count;
+ int32_t begin[4];
+ int8_t size_count;
+ int32_t size[4];
+};
+
+struct StridedSliceParams
+{
+ int8_t start_indices_count;
+ int16_t start_indices[4];
+ int8_t stop_indices_count;
+ int16_t stop_indices[4];
+ int8_t strides_count;
+ int16_t strides[4];
+
+ int16_t begin_mask;
+ int16_t ellipsis_mask;
+ int16_t end_mask;
+ int16_t new_axis_mask;
+ int16_t shrink_axis_mask;
+};
+
+struct SplitParams
+{
+ uint16_t num_split;
+ int16_t axis;
+};
+
+struct SplitVParams
+{
+ uint16_t num_split;
+ int16_t axis;
+};
+
+struct FusedBatchNormParams
+{
+ bool is_training;
+ std::string data_format; // UNKNOWN(0), NHWC(1), NCHW(2)
+ float epsilon;
+};
+
+struct SpaceToBatchParams
+{
+ // "Zero" padding for uint8 means padding with the output offset.
+ int32_t output_offset;
+};
+
+struct SpaceToDepthParams
+{
+ int32_t block_size;
+};
+
+enum class Order
+{
+ kColMajor,
+ kRowMajor
+};
+
+enum class CachePolicy : std::uint8_t
+{
+ kNeverCache,
+ kCacheIfLargeSpeedup,
+ kAlwaysCache,
+};
+
+// MatrixParams encapsulates the parameters that Gemm needs about each
+// matrix, besides the buffer data pointer.
+// Compare to ruy::Matrix, which also encapsulates the data pointer.
+// Rationale for leaving the data pointer out of here: including it would
+// require complicated const-correctness mechanics. See
+// ruy::ConstCheckingPtr.
+template <typename Scalar> struct MatrixParams
+{
+ // Storage layout order. For now we only do plain linear non-strided
+ // layout. It would be easy to support a stride if needed.
+ Order order = Order::kColMajor;
+ // Number of rows of the matrix.
+ int rows = 0;
+ // Number of columns of the matrix.
+ int cols = 0;
+ // The zero_point, i.e. which Scalar value is to be interpreted as zero.
+ // When Scalar is floating-point, this must be 0.
+ Scalar zero_point = 0;
+ // When the data pointed to by this matrix is constant data, so that it is
+ // valid to assume that equality of pointers implies equality of data,
+ // a CachePolicy may be used instead of the default kNeverCache,
+ // which will enable ruy to take advantage of this constancy of the data to
+ // cache the packing work, which can be a large speedup in matrix*vector
+ // and other narrow shapes.
+ CachePolicy cache_policy = CachePolicy::kNeverCache;
+};
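+
+// For example (illustrative; mirrors the setup in NeonCpuBackendGemm in
+// cker/NeonTensorUtils.h), a row-major int8 weight matrix whose packed form
+// may be cached looks like:
+//   MatrixParams<int8_t> lhs_params;
+//   lhs_params.order = Order::kRowMajor;
+//   lhs_params.rows = n_output;  // hypothetical dimensions
+//   lhs_params.cols = n_input;
+//   lhs_params.cache_policy = CachePolicy::kAlwaysCache;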
+
+// Enumeration of broad categories of Gemm.
+//
+// The primary reason for this to exist is to allow Gemm to compile
+// only uniform-quantized or only per-channel-quantized code paths.
+// This is unneeded with ruy as the back-end, as this is only a runtime
+// difference in ruy, but with gemmlowp these really are separate code
+// paths and templatizing in a QuantizationFlavor is necessary to avoid
+// compiling unused gemmlowp code. Indeed, TFLite currently uses
+// uint8 with uniform quantization and int8 with per-channel quantization,
+// and does not use uint8 with per-channel. We want to avoid compiling
+// the gemmlowp uint8 per-channel path when gemmlowp is the back-end.
+//
+// It's possible to drop this in the future if gemmlowp goes away and no
+// other then-relevant backend library handles quantized paths in a way that
+// requires knowing this at compile-time.
+enum class QuantizationFlavor
+{
+ // Floating-point Gemm: the accumulators are not multiplied by any
+ // 'multiplier'.
+ kFloatingPoint,
+ // Quantized Gemm using a single multiplier for all accumulators.
+ kIntegerWithUniformMultiplier,
+  // Quantized Gemm using separate multipliers for the accumulators of each
+  // row of the destination matrix. This is what is called 'per-channel'
+ // in GemmParams. Here we use the more specific 'per-row' terminology
+ // to allow for the possibility of 'per-column' in the future, and to
+ // allow for that to be a separate code path in some back-end such as
+ // gemmlowp.
+ kIntegerWithPerRowMultiplier
+};
+
+// Additional parameters that Gemm needs, beyond what falls into
+// the MatrixParams that it takes. Compare to ruy::Spec.
+//
+// Decoupling AccumScalar from DstScalar (rather than deducing it from that)
+// is useful future-proofing. Think of a float16 path using float32 accum.
+//
+// QuantizationFlavor is passed here even though it's technically not used
+// in this class. This is so that we retain the ability in the future to
+// specialize this class for quantization flavor, and this allows for
+// Gemm to be templatized in quantization_flavor via the GemmParams that it
+// takes, allowing for automatic template parameter deduction to take place,
+// so that most call sites don't need to specify a QuantizationFlavor
+// (only those that need per-channel quantization do).
+template <typename AccumScalar, typename DstScalar,
+ QuantizationFlavor quantization_flavor =
+ std::is_floating_point<AccumScalar>::value
+ ? QuantizationFlavor::kFloatingPoint
+ : QuantizationFlavor::kIntegerWithUniformMultiplier>
+struct GemmParams
+{
+ // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa)
+  // of the multiplier by which accumulators are multiplied before being cast
+ // to the destination type.
+ AccumScalar multiplier_fixedpoint = 0;
+ // Only for non-floating-point cases. The exponent part of the aforementioned
+ // multiplier.
+ int multiplier_exponent = 0;
+ // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must
+ // point to a buffer of as many values as there are rows in the destination
+ // matrix. Each row of the destination matrix will use the corresponding
+ // buffer element instead of multiplier_fixedpoint.
+ const AccumScalar *multiplier_fixedpoint_perchannel = nullptr;
+ // Per-channel variant of multiplier_exponent. If not nullptr, this must
+ // point to a buffer of as many values as there are rows in the destination
+ // matrix. Each row of the destination matrix will use the corresponding
+ // buffer element instead of multiplier_exponent.
+ //
+ // Either none or both of multiplier_exponent_perchannel and
+ // multiplier_fixedpoint_perchannel must be nullptr.
+ const int *multiplier_exponent_perchannel = nullptr;
+ // The bias vector data, if not null.
+ const AccumScalar *bias = nullptr;
+ // min clamp bound of destination values.
+ DstScalar clamp_min = std::is_floating_point<DstScalar>::value
+ ? -std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::lowest();
+ // max clamp bound of destination values.
+ DstScalar clamp_max = std::is_floating_point<DstScalar>::value
+ ? std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::max();
+};
+
+// Validates self-consistency of GemmParams.
+template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor>
+void ValidateGemmParams(const GemmParams<AccumScalar, DstScalar, quantization_flavor> &params)
+{
+ // Guard consistency of the quantized multiplier fields.
+ if (quantization_flavor == QuantizationFlavor::kFloatingPoint)
+ {
+ assert(!params.multiplier_fixedpoint);
+ assert(!params.multiplier_exponent);
+ assert(!params.multiplier_fixedpoint_perchannel);
+ assert(!params.multiplier_exponent_perchannel);
+ }
+ else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier &&
+ !std::is_same<DstScalar, int32_t>::value)
+ {
+ assert(params.multiplier_fixedpoint);
+ // Nothing to check about multiplier_exponent
+ assert(!params.multiplier_fixedpoint_perchannel);
+ assert(!params.multiplier_exponent_perchannel);
+ }
+ else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier &&
+ !std::is_same<DstScalar, int32_t>::value)
+ {
+ assert(!params.multiplier_fixedpoint);
+ assert(!params.multiplier_exponent);
+ assert(params.multiplier_fixedpoint_perchannel);
+ assert(params.multiplier_exponent_perchannel);
+ }
+ else
+ {
+    // For the raw-accumulator case (int32 destination), make sure none of the
+    // quantization params are set.
+ assert(!params.multiplier_fixedpoint);
+ assert(!params.multiplier_exponent);
+ assert(!params.multiplier_fixedpoint_perchannel);
+ assert(!params.multiplier_exponent_perchannel);
+ }
+ UNUSED_RELEASE(params);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TYPES_H__
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
new file mode 100644
index 000000000..2abb998d0
--- /dev/null
+++ b/compute/cker/include/cker/Utils.h
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UTILS_H__
+#define __NNFW_CKER_UTILS_H__
+
+#include "Shape.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <fixedpoint/fixedpoint.h>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
+{
+ return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
+}
+
+inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
+{
+ if (double_multiplier == 0.)
+ {
+ *quantized_multiplier = 0;
+ *shift = 0;
+ return;
+ }
+
+ const double q = std::frexp(double_multiplier, shift);
+ auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));
+
+ assert(q_fixed <= (1ll << 31));
+ if (q_fixed == (1ll << 31))
+ {
+ q_fixed /= 2;
+ ++*shift;
+ }
+ assert(q_fixed <= std::numeric_limits<int32_t>::max());
+ // A shift amount smaller than -31 would cause all bits to be shifted out
+ // and thus all results would be zero. We implement that instead with
+ // q_fixed==0, so as to avoid hitting issues with right-shift
+ // operations with shift amounts greater than 31. Note that this happens
+ // roughly when abs(double_multiplier) < 2^-31 and the present handling means
+ // that we're effectively flushing tiny double_multiplier's to zero.
+ // We could conceivably handle values in the range (roughly) [32, 63]
+  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). From that point of view
+ // the present handling is just doing 'flush denormals to zero'. We could
+ // reconsider and actually generate nonzero denormals if a need arises.
+ if (*shift < -31)
+ {
+ *shift = 0;
+ q_fixed = 0;
+ }
+ *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
+ int32_t *quantized_multiplier, int *left_shift)
+{
+ assert(double_multiplier < 1.0);
+ assert(double_multiplier > 0.0);
+ int shift;
+ QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
+ assert(shift <= 0);
+ *left_shift = shift;
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
+{
+ int left_shift = shift > 0 ? shift : 0;
+ int right_shift = shift > 0 ? 0 : -shift;
+ return gemmlowp::RoundingDivideByPOT(
+ gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
+ right_shift);
+}
+
+inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
+ int left_shift)
+{
+ return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
+}
+
+inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
+ int32_t quantized_multiplier,
+ int left_shift)
+{
+ return gemmlowp::RoundingDivideByPOT(
+ gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
+}
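
A quick numeric sketch (illustration only, not part of the patch) of how the helpers above work together: QuantizeMultiplier decomposes a real-valued scale into a Q0.31 fixed-point multiplier plus a power-of-two shift, and MultiplyByQuantizedMultiplier applies that pair to an int32 value.

std::int32_t quantized_multiplier = 0;
int shift = 0;
nnfw::cker::QuantizeMultiplier(0.75, &quantized_multiplier, &shift);
// quantized_multiplier == round(0.75 * 2^31), shift == 0

std::int32_t scaled = nnfw::cker::MultiplyByQuantizedMultiplier(1000, quantized_multiplier, shift);
// scaled == 750, i.e. round(1000 * 0.75) computed in saturating fixed-point arithmetic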
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline int CountLeadingZeros(uint32_t integer_input)
+{
+ const uint32_t one_in_leading_positive = 1U << 31;
+ int leading_zeros = 0;
+ while (integer_input < one_in_leading_positive)
+ {
+ integer_input <<= 1;
+ ++leading_zeros;
+ }
+ return leading_zeros;
+}
+
+inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
+ int32_t *output_inv_sqrt, int *output_shift)
+{
+ assert(input >= 0);
+ if (input <= 1)
+ {
+ // Handle the input value 1 separately to avoid overflow in that case
+ // in the general computation below (b/143972021). Also handle 0 as if it
+ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
+ // but rare/unrealistic input value. We can expect both to occur in some
+ // incompletely trained models, but probably not in fully trained models.
+ *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
+ *output_shift = 0;
+ return;
+ }
+ assert(input > 1);
+ *output_shift = 11;
+ while (input >= (1 << 29))
+ {
+ input /= 4;
+ ++*output_shift;
+ }
+ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
+ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+ *output_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+ assert(input >= (1 << 27));
+ assert(input < (1 << 29));
+ using gemmlowp::FixedPoint;
+ using gemmlowp::Rescale;
+ using gemmlowp::SaturatingRoundingMultiplyByPOT;
+ // Using 3 integer bits gives us enough room for the internal arithmetic in
+ // this Newton-Raphson iteration.
+ using F3 = FixedPoint<int32_t, 3>;
+ using F0 = FixedPoint<int32_t, 0>;
+ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+ const F3 fixedpoint_half_three =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ // Newton-Raphson iteration
+ // Naive unoptimized starting guess: x = 1
+ F3 x = F3::One();
+ // Naive unoptimized number of iterations: 5
+ for (int i = 0; i < 5; i++)
+ {
+ const F3 x3 = Rescale<3>(x * x * x);
+ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+ }
+ const F0 fixedpoint_half_sqrt_2 =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ x = x * fixedpoint_half_sqrt_2;
+ *output_inv_sqrt = x.raw();
+ if (*output_shift < 0)
+ {
+ *output_inv_sqrt <<= -*output_shift;
+ *output_shift = 0;
+ }
+ // Convert right shift (right is positive) to left shift.
+ *output_shift *= reverse_shift;
+}
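
For intuition, the fixed-point loop above is the classic Newton-Raphson recurrence for the inverse square root, x <- x * (3 - a * x^2) / 2. A floating-point sketch of the same recurrence (illustration only, not part of the patch):

double a = 0.3; // stands in for the F3-scaled input, normalized into [1/4, 1)
double x = 1.0; // same naive starting guess as above
for (int i = 0; i < 5; ++i)
{
  // x <- 1.5 * x - (a / 2) * x^3, matching the fixed-point update above
  x = 1.5 * x - 0.5 * a * x * x * x;
}
// x is now ~1.8257, i.e. ~1 / sqrt(0.3)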
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N> struct NdArrayDesc
+{
+ // The "extent" of each dimension. Indices along dimension d must be in the
+ // half-open interval [0, extents[d]).
+ int extents[N];
+
+ // The number of *elements* (not bytes) between consecutive indices of each
+ // dimension.
+ int strides[N];
+};
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
+{
+ assert(i0 >= 0 && i0 < desc.extents[0]);
+ assert(i1 >= 0 && i1 < desc.extents[1]);
+ assert(i2 >= 0 && i2 < desc.extents[2]);
+ assert(i3 >= 0 && i3 < desc.extents[3]);
+ return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
+}
+
+template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter)
+{
+ int ret_indx = 0;
+ for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
+ {
+ assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
+ ret_indx += iter[idx] * desc->strides[idx];
+ }
+
+ return ret_indx;
+}
+
+// Copies dims to desc, calculating strides.
+template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out)
+{
+ int desc_stride = 1;
+ for (int i = N - 1; i >= 0; --i)
+ {
+ desc_out->extents[i] = input_shape.Dims(i);
+ desc_out->strides[i] = desc_stride;
+ desc_stride *= input_shape.Dims(i);
+ }
+}
+
+template <int N>
+inline void
+NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
+ NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
+{
+ assert(desc0_out != nullptr);
+ assert(desc1_out != nullptr);
+
+ auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
+ auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
+
+ // Copy dims to desc, calculating strides.
+ CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
+ CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i)
+ {
+ const int extent0 = extended_input0_shape.Dims(i);
+ const int extent1 = extended_input1_shape.Dims(i);
+ if (extent0 != extent1)
+ {
+ if (extent0 == 1)
+ {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent1;
+ }
+ else
+ {
+ assert(extent1 == 1);
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent0;
+ }
+ }
+ }
+}
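
A small sketch (not part of the patch, and assuming Shape's (dims_count, dims_data) constructor from cker/Shape.h) of the broadcasting trick described above: a dimension of extent 1 gets stride 0, so every index along that dimension reads the same element.

const std::int32_t a_dims[2] = {1, 3};
const std::int32_t b_dims[2] = {2, 3};
nnfw::cker::Shape a_shape(2, a_dims), b_shape(2, b_dims);

nnfw::cker::NdArrayDesc<4> a_desc, b_desc;
nnfw::cker::NdArrayDescsForElementwiseBroadcast(a_shape, b_shape, &a_desc, &b_desc);

// a_desc.strides[2] == 0 and a_desc.extents[2] == 2: both output rows read row 0 of `a`.
int a_index = nnfw::cker::SubscriptToIndex(a_desc, 0, 0, 1, 2); // == 2, i.e. element (0, 2) of `a`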
+
+template <int N>
+inline void
+NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
+ const Shape &input2_shape, NdArrayDesc<N> *desc0_out,
+ NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out)
+{
+ assert(desc0_out != nullptr);
+ assert(desc1_out != nullptr);
+ assert(desc2_out != nullptr);
+
+ auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
+ auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
+ auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);
+
+ // Copy dims to desc, calculating strides.
+ CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
+ CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
+ CopyDimsToDesc<N>(extended_input2_shape, desc2_out);
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i)
+ {
+ const int extent0 = extended_input0_shape.Dims(i);
+ const int extent1 = extended_input1_shape.Dims(i);
+ const int extent2 = extended_input2_shape.Dims(i);
+
+ int extent = extent0;
+ if (extent1 != 1)
+ extent = extent1;
+ if (extent2 != 1)
+ extent = extent2;
+
+ assert(extent0 == 1 || extent0 == extent);
+ assert(extent1 == 1 || extent1 == extent);
+ assert(extent2 == 1 || extent2 == extent);
+
+ if (!(extent0 == extent1 && extent1 == extent2))
+ {
+ if (extent0 == 1)
+ {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent;
+ }
+ if (extent1 == 1)
+ {
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent;
+ }
+ if (extent2 == 1)
+ {
+ desc2_out->strides[i] = 0;
+ desc2_out->extents[i] = extent;
+ }
+ }
+ }
+}
+
+// Gets next index to iterate through a multidimensional array.
+inline bool NextIndex(const int num_dims, const int *dims, int *current)
+{
+ if (num_dims == 0)
+ {
+ return false;
+ }
+ assert(dims != nullptr);
+ assert(current != nullptr);
+ int carry = 1;
+ for (int idx = num_dims - 1; idx >= 0; --idx)
+ {
+ int current_val = current[idx] + carry;
+ assert(dims[idx] >= current_val);
+ if (dims[idx] == current_val)
+ {
+ current[idx] = 0;
+ }
+ else
+ {
+ current[idx] = current_val;
+ carry = 0;
+ break;
+ }
+ }
+ return (carry == 0);
+}
+
+// Gets the offset of an index when reducing on an axis. When reducing, the
+// flattened offset does not change if the input index changes only along a
+// reduced axis. For example, if you have a 3D tensor and you are reducing to
+// 2D by eliminating axis 0, then index (0, 1, 2) and index (1, 1, 2) map to
+// the same flattened offset.
+// TODO(kanlig): use Dims to represent dimensions.
+inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
+ const int num_axis, const int *axis)
+{
+ if (num_dims == 0)
+ {
+ return 0;
+ }
+
+ assert(dims != nullptr);
+ assert(index != nullptr);
+
+ size_t offset = 0;
+ for (int idx = 0; idx < num_dims; ++idx)
+ {
+ // if we need to skip this axis
+ bool is_axis = false;
+ if (axis != nullptr)
+ {
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (idx == axis[axis_idx])
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ }
+ if (!is_axis)
+ {
+ offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
+ }
+ }
+ return offset;
+}
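
The example from the comment above, spelled out (a sketch, not part of the patch): with axis 0 reduced, two indices that differ only in dimension 0 land on the same flattened offset.

const int dims[3] = {4, 5, 6};
const int axis[1] = {0};
const int index_a[3] = {0, 1, 2};
const int index_b[3] = {1, 1, 2};

size_t off_a = nnfw::cker::ReducedOutputOffset(3, dims, index_a, 1, axis); // 1 * 6 + 2 == 8
size_t off_b = nnfw::cker::ReducedOutputOffset(3, dims, index_b, 1, axis); // also 8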
+
+template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
+{
+#ifdef __GNUC__
+ // builtin offered by GCC-compatible compilers including clang
+ __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
+#else
+ (void)ptr;
+#endif
+}
+
+// Writes randomly accessed values from `input` sequentially into `output`.
+template <typename T> class SequentialTensorWriter
+{
+public:
+ SequentialTensorWriter(const T *input_data, T *output_data)
+ : input_data_(input_data), output_ptr_(output_data)
+ {
+ }
+
+ void Write(int position) { *output_ptr_++ = input_data_[position]; }
+ void WriteN(int position, int len)
+ {
+ memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
+ output_ptr_ += len;
+ }
+
+private:
+ const T *input_data_;
+ T *output_ptr_;
+};
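
Usage sketch for the writer above (not part of the patch): gathering out-of-order elements of a small buffer.

const float src[4] = {10.f, 20.f, 30.f, 40.f};
float dst[4];
nnfw::cker::SequentialTensorWriter<float> writer(src, dst);
writer.Write(3);     // dst[0] = src[3]
writer.WriteN(0, 3); // dst[1..3] = src[0..2]
// dst is now {40, 10, 20, 30}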
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UTILS_H__
diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h
new file mode 100644
index 000000000..49c34211a
--- /dev/null
+++ b/compute/cker/include/cker/eigen/EigenSupport.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__
+#define __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__
+
+//#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include <Eigen/Core>
+#include <thread>
+#include "cker/eigen/eigen_spatial_convolutions.h"
+
+#ifdef EIGEN_USE_THREADS
+#include <unsupported/Eigen/CXX11/ThreadPool>
+#endif
+
+namespace nnfw
+{
+namespace cker
+{
+namespace eigen_support
+{
+
+// Shorthands for the types we need when interfacing with the EigenTensor
+// library.
+typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ EigenMatrix;
+typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ ConstEigenMatrix;
+
+typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ EigenTensor;
+typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ ConstEigenTensor;
+
+// Utility functions we need for the EigenTensor API.
+template <typename Device, typename T> struct MatMulConvFunctor
+{
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(const Device &d, EigenMatrix out, ConstEigenMatrix in0, ConstEigenMatrix in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> &dim_pair)
+ {
+ out.device(d) = in0.contract(in1, dim_pair);
+ }
+};
+
+// We have a single global threadpool for all convolution operations. This means
+// that inferences started from different threads may block each other, but
+// since the underlying resource of CPU cores should be consumed by the
+// operations anyway, it shouldn't affect overall performance.
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface
+{
+public:
+ // Takes ownership of 'pool'
+ explicit EigenThreadPoolWrapper(Eigen::ThreadPool *pool) : pool_(pool) {}
+ ~EigenThreadPoolWrapper() override {}
+
+ void Schedule(std::function<void()> fn) override { pool_->Schedule(std::move(fn)); }
+ int NumThreads() const override { return pool_->NumThreads(); }
+ int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+private:
+ std::unique_ptr<Eigen::ThreadPool> pool_;
+};
+
+struct EigenContext
+{
+ constexpr static int default_num_threadpool_threads = 4;
+ std::unique_ptr<Eigen::ThreadPoolInterface> thread_pool_wrapper;
+ std::unique_ptr<Eigen::ThreadPoolDevice> device;
+
+ EigenContext()
+ {
+ int num_threads = std::thread::hardware_concurrency();
+ if (num_threads == 0)
+ {
+ num_threads = default_num_threadpool_threads;
+ }
+ device.reset(); // destroy before we invalidate the thread pool
+ thread_pool_wrapper.reset(new EigenThreadPoolWrapper(new Eigen::ThreadPool(num_threads)));
+ device.reset(new Eigen::ThreadPoolDevice(thread_pool_wrapper.get(), num_threads));
+ }
+
+ static inline EigenContext &GetEigenContext()
+ {
+ static EigenContext instance;
+ return instance;
+ }
+};
+
+inline const Eigen::ThreadPoolDevice *GetThreadPoolDevice()
+{
+ auto &ctx = EigenContext::GetEigenContext();
+ return ctx.device.get();
+}
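
A sketch of how these pieces are typically wired together (illustration only, not part of the patch; it assumes the build defines EIGEN_USE_THREADS, which this header expects): a small row-major matrix product evaluated on the shared thread-pool device.

alignas(64) float a[6] = {1, 2, 3, 4, 5, 6}; // 2x3
alignas(64) float b[6] = {1, 0, 0, 1, 1, 1}; // 3x2
alignas(64) float c[4] = {};                 // 2x2 result

nnfw::cker::eigen_support::ConstEigenMatrix in0(a, 2, 3);
nnfw::cker::eigen_support::ConstEigenMatrix in1(b, 3, 2);
nnfw::cker::eigen_support::EigenMatrix out(c, 2, 2);

Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); // contract a's columns with b's rows

const Eigen::ThreadPoolDevice *device = nnfw::cker::eigen_support::GetThreadPoolDevice();
nnfw::cker::eigen_support::MatMulConvFunctor<Eigen::ThreadPoolDevice, float>()(*device, out, in0,
                                                                               in1, dim_pair);
// c is now {4, 5, 10, 11}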
+
+} // namespace eigen_support
+} // namespace cker
+} // namespace nnfw
+
+//#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__
diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h
new file mode 100644
index 000000000..f9c706370
--- /dev/null
+++ b/compute/cker/include/cker/eigen/Utils.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_UTILS_H__
+#define __NNFW_CKER_EIGEN_UTILS_H__
+
+#include <Eigen/Core>
+#include <type_traits>
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Make a local VectorMap typedef that allows mapping a float array
+// as an Eigen vector expression. The std::conditional here is to
+// construct the suitable Eigen type for the constness of the
+// data. Indeed, for const data, we need to produce
+// Eigen::Map<const Eigen::Matrix<float, ...>>
+// and not the more straightforward
+// Eigen::Map<Eigen::Matrix<const float, ...>>
+template <typename Scalar>
+using VectorMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
+
+template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape)
+{
+ const int size = shape.FlatSize();
+ return VectorMap<Scalar>(data, size, 1);
+}
+
+// Make a local MatrixMap typedef that allows mapping a float array
+// as an Eigen matrix expression. The same explanation as for VectorMap
+// above also applies here.
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic,
+ Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
+{
+ const int dims_count = shape.DimensionsCount();
+ const int rows = shape.Dims(dims_count - 1);
+ const int cols = FlatSizeSkipDim(shape, dims_count - 1);
+ return MatrixMap<Scalar>(data, rows, cols);
+}
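
A short sketch of the mapping (not part of the patch; it assumes Shape's (dims_count, dims_data) constructor): a 1x2x2x3 NHWC buffer viewed as a 3x4 column-major matrix whose rows are the channels.

float data[12];
for (int i = 0; i < 12; ++i)
  data[i] = static_cast<float>(i);

const std::int32_t dims[4] = {1, 2, 2, 3}; // N, H, W, C
nnfw::cker::Shape shape(4, dims);

auto mat = nnfw::cker::MapAsMatrixWithLastDimAsRows(data, shape);
// mat has 3 rows (channels) and 4 columns (N*H*W); column j holds pixel j's channels,
// e.g. mat(0, 1) == data[3] and mat(2, 3) == data[11].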
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EIGEN_UTILS_H__
diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h
new file mode 100644
index 000000000..dc3e2552d
--- /dev/null
+++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__
+#define __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__
+
+namespace Eigen
+{
+namespace internal
+{
+
+// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType>
+// provides `value` that is true if TensorEvaluatorType has `PacketType
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t)
+// const` and if the PacketType supports masked load.
+//
+// Partial packets are used to:
+//
+// 1) Split the packet over two columns in eigen based spatial convolution and
+// use partial loads for each individual part before combining them to get the
+// required packet. This class is used to pick the correct implementation of
+// loadPacketStandard function.
+//
+// 2) Split the packet over two rows (within the same column) in eigen based
+// cuboid convolution and use partial loads for each individual part before
+// combining them to get the required packet. This class is used to pick the
+// correct implementation of loadPacketStandard function. This usage is similar
+// to the usage in eigen based spatial convolution described above.
+//
+// 3) Finalize packing of columns in gemm_pack_colmajor after processing
+// vectorized part with full packets (see eigen_spatial_convolutions.h).
+template <typename TensorEvaluatorType, typename PacketType, typename IndexType>
+class TensorEvaluatorHasPartialPacket
+{
+public:
+ template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+ static auto functionExistsSfinae(
+ typename std::enable_if<
+ unpacket_traits<PacketT>::masked_load_available &&
+ std::is_same<
+ PacketT,
+ decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>(
+ std::declval<IndexT>(),
+ std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *)
+ -> std::true_type;
+
+ template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
+ static auto functionExistsSfinae(...) -> std::false_type;
+
+ typedef decltype(
+ functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status;
+
+ static constexpr bool value = status::value;
+};
+
+// Compute a mask for loading/storing coefficients in/from a packet in a
+// [from, to) range. If the mask bit is 1, element will be loaded/stored.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+ typename unpacket_traits<Packet>::mask_t>::type
+ mask(int from, int to)
+{
+ const Index packet_size = internal::unpacket_traits<Packet>::size;
+ eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
+
+ using Mask = typename internal::unpacket_traits<Packet>::mask_t;
+ const Mask mask_max = std::numeric_limits<Mask>::max();
+
+ return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from));
+}
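
To make the bit arithmetic above concrete, here is a standalone sketch with a hypothetical 8-lane mask type standing in for unpacket_traits<Packet>::mask_t (not a real instantiation):

#include <cstdint>

int main()
{
  // packet_size == 8, from == 2, to == 5
  const std::uint8_t mask_max = 0xFF;
  const std::uint8_t m = (mask_max >> (8 - 5)) ^ (mask_max >> (8 - 2));
  // m == 0b00011100: exactly lanes [2, 5) would be loaded/stored.
  return m == 0x1C ? 0 : 1;
}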
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__
diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h
new file mode 100644
index 000000000..92e1614d1
--- /dev/null
+++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h
@@ -0,0 +1,1783 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__
+#define __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__
+
+#include "cker/eigen/eigen_convolution_helpers.h"
+
+// Note this header is used in both TF and TFLite.
+namespace Eigen
+{
+
+namespace internal
+{
+
+// WARNING: Most of the code here implicitly assumes that the matrix is in
+// ColMajor layout. This is guaranteed by the tensor contraction (see
+// TensorContraction.h).
+//
+// Inside Eigen a tensor contraction is represented by a matrix multiplication.
+// We don't want to actually extract image patches and reshape the result into
+// a matrix (this involves allocating huge extra memory), so the patch
+// extraction and reshape operations are implicit.
+//
+// TensorContractionInputMapper takes a matrix index and returns the coefficient
+// (or the packet) of the "virtual tensor", that would be at that index if we
+// were to actually reshape the result of patch extraction.
+//
+// TensorContractionSubMapper provides a similar view into the "virtual matrix"
+// at the given vertical and horizontal offsets.
+//
+// "Virtual matrix" dimensions:
+// *0: kernelChannels * kernelRows * kernelCols;
+//    1: out_height * out_width * OTHERS (e.g. batches, etc...)
+//
+// *) extracted patches are contiguous in memory (innermost dimension assuming
+//    col major layout)
+//
+// With these dimensions:
+// row - offset within a single patch (in code: patchId)
+// col - index of the extracted patch (in code: patchIndex)
+// patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
+//
+// TODO(ezhulenev): Consolidate this part of the code with the image patch
+// extraction code since they are both very similar.
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device,
+ typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side,
+ int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper<
+ Scalar_, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+{
+public:
+ typedef Scalar_ Scalar;
+
+ typedef TensorContractionInputMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Self;
+
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
+
+ typedef SubMapper VectorMapper;
+ typedef SubMapper LinearMapper;
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ typedef TensorEvaluator<ArgType, Device> TensorEvaluatorT;
+
+ EIGEN_DEVICE_FUNC
+ TensorContractionInputMapper(
+ const TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device> &tensor,
+ const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &)
+ : m_impl(tensor.impl().impl())
+ {
+ Index patch_rows;
+ Index patch_depth;
+ if (internal::traits<ArgType>::Layout == ColMajor)
+ {
+ patch_depth = tensor.impl().dimensions()[0];
+ patch_rows = tensor.impl().dimensions()[1];
+ m_patch_cols = tensor.impl().dimensions()[2];
+ m_num_patches = tensor.impl().dimensions()[3];
+ }
+ else
+ {
+ const size_t NumDims = tensor.impl().dimensions().size();
+ patch_depth = tensor.impl().dimensions()[NumDims - 1];
+ patch_rows = tensor.impl().dimensions()[NumDims - 2];
+ m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
+ m_num_patches = tensor.impl().dimensions()[NumDims - 4];
+ }
+
+ // Strides for navigating through the single patch.
+ m_patch_row_stride = patch_depth;
+ m_patch_col_stride = patch_rows * m_patch_row_stride;
+
+ m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
+ m_patch_col_inflate_strides = tensor.impl().colInflateStride();
+
+ m_colStride = patch_rows;
+
+ m_outputRows = tensor.impl().outputRows();
+ m_outputCols = tensor.impl().outputCols();
+ m_row_strides = tensor.impl().userRowStride();
+ m_col_strides = tensor.impl().userColStride();
+
+ m_in_row_strides = tensor.impl().userInRowStride();
+ m_in_col_strides = tensor.impl().userInColStride();
+
+ if (internal::traits<ArgType>::Layout == ColMajor)
+ {
+ m_inputRows = tensor.impl().impl().dimensions()[1];
+ m_inputCols = tensor.impl().impl().dimensions()[2];
+ }
+ else
+ {
+ const int NumDims = tensor.impl().impl().dimensions().size();
+ m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
+ m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
+ }
+
+ m_rowInputStride = patch_depth;
+ m_colInputStride = patch_depth * m_inputRows;
+ m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
+
+ m_rowPaddingTop = tensor.impl().rowPaddingTop();
+ m_colPaddingLeft = tensor.impl().colPaddingLeft();
+
+ m_fastPatchRowStride = internal::TensorIntDivisor<Index>(m_patch_row_stride);
+ m_fastPatchColStride = internal::TensorIntDivisor<Index>(m_patch_col_stride);
+ m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
+ m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
+ m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
+ m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+ m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+ m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
+ }
+
+ EIGEN_DEVICE_FUNC
+ TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper)
+ : m_impl(base_mapper.m_impl)
+ {
+ m_patch_cols = base_mapper.m_patch_cols;
+ m_num_patches = base_mapper.m_num_patches;
+
+ m_patch_row_stride = base_mapper.m_patch_row_stride;
+ m_patch_col_stride = base_mapper.m_patch_col_stride;
+
+ m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
+ m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
+
+ m_colStride = base_mapper.m_colStride;
+
+ m_rowInputStride = base_mapper.m_rowInputStride;
+ m_colInputStride = base_mapper.m_colInputStride;
+ m_patchInputStride = base_mapper.m_patchInputStride;
+
+ m_inputRows = base_mapper.m_inputRows;
+ m_inputCols = base_mapper.m_inputCols;
+
+ m_outputRows = base_mapper.m_outputRows;
+ m_outputCols = base_mapper.m_outputCols;
+ m_row_strides = base_mapper.m_row_strides;
+ m_col_strides = base_mapper.m_col_strides;
+
+ m_in_row_strides = base_mapper.m_in_row_strides;
+ m_in_col_strides = base_mapper.m_in_col_strides;
+
+ m_rowPaddingTop = base_mapper.m_rowPaddingTop;
+ m_colPaddingLeft = base_mapper.m_colPaddingLeft;
+
+ m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
+ m_fastPatchColStride = base_mapper.m_fastPatchColStride;
+ m_fastInputRowStride = base_mapper.m_fastInputRowStride;
+ m_fastInputColStride = base_mapper.m_fastInputColStride;
+ m_fastNumPatches = base_mapper.m_fastNumPatches;
+ m_fastColStride = base_mapper.m_fastColStride;
+ m_fastOutputRows = base_mapper.m_fastOutputRows;
+ m_fastDimZero = base_mapper.m_fastDimZero;
+ }
+
+  // If true, turns off some optimizations for loading packets since the image
+  // patches are "non-standard", e.g. there are non-trivial strides or
+  // inflations in the input.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool nonStandardPatches() const
+ {
+ return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 ||
+ m_patch_col_inflate_strides != 1;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const
+ {
+ return SubMapper(*this, i, j);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const
+ {
+ return LinearMapper(*this, i, j);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const
+ {
+ Index rowIndex, colIndex, otherIndex;
+ computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+ return loadCoeff(row, rowIndex, colIndex, otherIndex);
+ }
+
+  // Load the coefficient at the patchIndex location instead of the usual
+  // m_rowIndex, m_colIndex, m_otherIndex. This is currently only used by the
+  // gpu code.
+  EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const
+ {
+ Index rowIndex, colIndex, otherIndex;
+ computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+ return loadCoeff(row, rowIndex, colIndex, otherIndex);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const
+ {
+ Index rowIndex, colIndex, otherIndex;
+ computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+ return loadPacket(row, rowIndex, colIndex, otherIndex);
+ }
+
+ // Load the packet at the patchIndex location instead of the usual m_rowIndex,
+ // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const
+ {
+ Index rowIndex, colIndex, otherIndex;
+ computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+ return loadPacket(row, rowIndex, colIndex, otherIndex);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device> &impl() const { return m_impl; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
+
+private:
+ friend class TensorContractionSubMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>;
+
+ // Load coefficient from a patch specified by the "within patch offset"
+ // (patchId) and the precomputed indices of the first element of the patch.
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex,
+ Index otherIndex) const
+ {
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffset = patchId / m_fastDimZero;
+
+ const Index colOffset = patchOffset / m_fastColStride;
+ const Index inputCol = colIndex + colOffset * m_in_col_strides;
+ const Index origInputCol = (m_patch_col_inflate_strides == 1)
+ ? inputCol
+ : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+
+ const Index rowOffset = patchOffset - colOffset * m_colStride;
+ const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
+ const Index origInputRow = (m_patch_row_inflate_strides == 1)
+ ? inputRow
+ : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+ if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
+ origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) ||
+ (inputRow != origInputRow * m_patch_row_inflate_strides))
+ {
+ return Scalar(0);
+ }
+ const Index depth = patchId - patchOffset * patchDepth();
+ const Index inputIndex =
+ depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex;
+ return m_impl.coeff(inputIndex);
+ }
+
+ // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
+ // and `in_strides` equal to 1 (template specialization without templates).
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex,
+ Index otherIndex) const
+ {
+ eigen_assert(!nonStandardPatches());
+
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffset = patchId / m_fastDimZero;
+ const Index colOffset = patchOffset / m_fastColStride;
+ const Index rowOffset = patchOffset - colOffset * m_colStride;
+ const Index inputCol = colIndex + colOffset;
+ const Index inputRow = rowIndex + rowOffset;
+ if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows)
+ {
+ return Scalar(0);
+ }
+ const Index depth = patchId - patchOffset * patchDepth();
+ const Index inputIndex =
+ depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ return m_impl.coeff(inputIndex);
+ }
+
+ // Load packet from a patch specified by the "within patch offset"
+ // (patchId) and the precomputed indices of the first element of the patch.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex,
+ Index otherIndex) const
+ {
+ const Index packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+ if (nonStandardPatches())
+ {
+ return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+ }
+ typedef decltype(m_impl) TensorEvaluatorT;
+ return loadPacketStandard<Packet, TensorEvaluatorT>(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ // Helper function to load a 'partial' packet - this is the single column
+ // part of a packet that is split across two columns. In the 'partial' packet,
+ // the elements corresponding to the column (specified through colOffset) are
+ // loaded and the rest of the elements are zero-filled into the 'partial'
+ // packet. This function is called from loadPacketStandardFromTwoColumns().
+ // This code path is exercised only when the packet type supports masked load
+ // and when the partial packet load is available in the TensorEvaluator.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(Index rowIndex, Index colIndex,
+ Index otherIndex, Index patchId,
+ const Index span[],
+ const Index patchOffsets[],
+ Index colOffset) const
+ {
+ const Index inputCol = colIndex + colOffset;
+ const Index rowOffsets[2] = {patchOffsets[0] - colOffset * m_colStride,
+ patchOffsets[1] - colOffset * m_colStride};
+ const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
+
+ if (inputRows[0] >= m_inputRows || inputRows[1] < 0 || inputCol >= m_inputCols || inputCol < 0)
+ {
+ // Partial packet is all zeros
+ return internal::pset1<Packet>(Scalar(0));
+ }
+ else if (inputRows[0] >= 0 && inputRows[1] < m_inputRows)
+ {
+ // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+ const Index depth = patchId - patchOffsets[0] * patchDepth();
+ const Index inputIndex =
+ depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ return m_impl.template partialPacket<Packet>(inputIndex - span[0],
+ mask<Packet>(span[0], span[1] + 1));
+ }
+ else
+ {
+ // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      //   0 : span[0]-1            - Zeros will be loaded for these indices
+      //   span[0] : span[1]        - Elements will be loaded here for these indices
+      //   span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+ const Index packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX
+ typename internal::remove_const<Scalar>::type values[packetSize];
+ for (int i = 0; i < span[0]; ++i)
+ values[i] = Scalar(0);
+ for (int i = span[0]; i < span[1] + 1; ++i)
+ values[i] = loadCoeff(patchId - span[0] + i, rowIndex, colIndex, otherIndex);
+ for (int i = span[1] + 1; i < packetSize; ++i)
+ values[i] = Scalar(0);
+ return internal::pload<Packet>(values);
+ }
+ }
+
+ // Helper function to load a packet that is split across two columns.
+ // If required, this function is called from loadPacketStandard() when the
+ // packet type supports masked load and when the partial packet load is
+ // available in the TensorEvaluator.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromTwoColumns(Index patchId, Index rowIndex,
+ Index colIndex, Index otherIndex,
+ const Index patchOffsets[],
+ const Index colOffsets[]) const
+ {
+ eigen_assert(colOffsets[1] == colOffsets[0] + 1);
+ const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+ // Packet to load will be split into 2 parts where each part spans a single
+ // column. First determine where to split.
+ const Index patchIdSplit = ((colOffsets[1] * m_colStride) * m_rowInputStride) - 1;
+ const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+ // patchIds[i]: patchId corresponding to partial packet i
+ // spans[i]: Start and end indices corresponding to the elements
+ // to be loaded for partial packet i
+ // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+ const Index patchIds[2] = {patchId, patchIdSplit + 1};
+ const Index spans[2][2] = {{0, patchIdSplit - patchId},
+ {patchIdSplit - patchId + 1, packetSize - 1}};
+ const Index patchOffsets2Cols[2][2] = {{patchOffsets[0], patchOffsetSplit},
+ {patchOffsetSplit + 1, patchOffsets[1]}};
+
+ // Load partial packets and do bit-wise OR to generate required packet
+ return internal::por<Packet>(
+ loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0],
+ patchOffsets2Cols[0], colOffsets[0]),
+ loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1],
+ patchOffsets2Cols[1], colOffsets[1]));
+ }
+
+  // Helper function to load a packet that is present in a single column.
+ // If required, this function is called from loadPacketStandard().
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumn(Index patchId, Index rowIndex,
+ Index colIndex, Index otherIndex,
+ const Index patchOffsets[],
+ const Index colOffsets[],
+ const Index inputCols[]) const
+ {
+ eigen_assert(colOffsets[0] == colOffsets[1]);
+ const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride,
+ patchOffsets[1] - colOffsets[1] * m_colStride};
+ eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+ const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
+
+ if (inputRows[0] >= m_inputRows || inputRows[1] < 0)
+ {
+ // all zeros
+ return internal::pset1<Packet>(Scalar(0)); // all zeros
+ }
+
+ if (inputRows[0] >= 0 && inputRows[1] < m_inputRows)
+ {
+ // no padding
+ const Index depth = patchId - patchOffsets[0] * patchDepth();
+ const Index inputIndex =
+ depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ // Load standard packet from a patch specified by the "within patch offset"
+ // (patchId) and the precomputed indices of the first element of the patch.
+ // This function will be called if partial packet loading is not available
+ // for the TensorEvaluator or if the packet type does not support masked
+ // load.
+ template <typename PacketT, typename TensorEvaluatorT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+ !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
+ {
+ const Index packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+ eigen_assert(!nonStandardPatches());
+
+ if ((patchDepth() % packetSize) == 0)
+ {
+ return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ // Offsets and input calculation here are identical to
+ // loadCoeffStandard(...), but repeated twice.
+ const Index patchOffsets[2] = {patchId / m_fastDimZero,
+ (patchId + packetSize - 1) / m_fastDimZero};
+ const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+ patchOffsets[1] / m_fastColStride};
+ const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]};
+
+ if (inputCols[0] >= m_inputCols || inputCols[1] < 0)
+ {
+ // all zeros
+ return internal::pset1<Packet>(Scalar(0));
+ }
+ if (inputCols[0] == inputCols[1])
+ {
+ return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex, otherIndex,
+ patchOffsets, colOffsets, inputCols);
+ }
+ return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ // Load standard packet from a patch specified by the "within patch offset"
+ // (patchId) and the precomputed indices of the first element of the patch.
+ // This function will be called if partial packet loading is available for
+ // the TensorEvaluator and if the packet type supports masked load.
+ // The only difference between this and the other case is that if the packet
+ // to load is split across two columns, then in this case instead of going to
+ // the slow (element-by-element) load, we load two packets - each containing
+ // elements from one of the columns (rest of the elements of the packets are
+ // zeroes), and then combine these two packets to generate the required
+ // packet. The idea is to enable fast load (if possible) of these 'partial'
+ // packets.
+ template <typename PacketT, typename TensorEvaluatorT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+ TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
+ {
+ const Index packetSize = internal::unpacket_traits<PacketT>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+ eigen_assert(!nonStandardPatches());
+
+ if ((patchDepth() % packetSize) == 0)
+ {
+ return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ // Offsets and input calculation here are identical to
+ // loadCoeffStandard(...), but repeated twice.
+ const Index patchOffsets[2] = {patchId / m_fastDimZero,
+ (patchId + packetSize - 1) / m_fastDimZero};
+ const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+ patchOffsets[1] / m_fastColStride};
+ const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]};
+
+ if (inputCols[0] >= m_inputCols || inputCols[1] < 0)
+ {
+ // all zeros
+ return internal::pset1<PacketT>(Scalar(0));
+ }
+ if (inputCols[0] == inputCols[1])
+ {
+ return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex, otherIndex,
+ patchOffsets, colOffsets, inputCols);
+ }
+ if (inputCols[1] == inputCols[0] + 1)
+ {
+ return loadPacketStandardFromTwoColumns(patchId, rowIndex, colIndex, otherIndex, patchOffsets,
+ colOffsets);
+ }
+ return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex,
+ Index otherIndex) const
+ {
+ const Index packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+ eigen_assert(!nonStandardPatches());
+ eigen_assert((patchDepth() % packetSize) == 0);
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffset = patchId / m_fastDimZero;
+ eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
+
+ const Index colOffset = patchOffset / m_fastColStride;
+ const Index rowOffset = patchOffset - colOffset * m_colStride;
+ const Index inputCol = colIndex + colOffset;
+ const Index inputRow = rowIndex + rowOffset;
+ if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols || inputRow >= m_inputRows)
+ {
+ // all zeros
+ return internal::pset1<Packet>(Scalar(0));
+ }
+ // no padding
+ const Index depth = patchId - patchOffset * patchDepth();
+ const Index inputIndex =
+ depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex,
+ Index colIndex,
+ Index otherIndex) const
+ {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX
+ typename internal::remove_const<Scalar>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i)
+ {
+ values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
+ }
+ Packet rslt = internal::pload<Packet>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const
+ {
+ const size_t NumInputDims =
+ array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
+ const Index patch2DIndex =
+ (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches);
+ otherIndex *= m_patchInputStride;
+ colIndex = patch2DIndex / m_fastOutputRows;
+ rowIndex = patch2DIndex - colIndex * m_outputRows;
+ colIndex = colIndex * m_col_strides - m_colPaddingLeft;
+ rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
+ }
+
+ Index m_patch_cols; // number of columns in the patch
+ Index m_num_patches; // number of patches to extract.
+
+ // Strides for navigating through the single patch.
+ Index m_patch_row_stride;
+ Index m_patch_col_stride;
+ internal::TensorIntDivisor<Index> m_fastPatchRowStride;
+ internal::TensorIntDivisor<Index> m_fastPatchColStride;
+
+ Index m_patch_row_inflate_strides; // the strides for row inflation in the
+ // image patch
+ Index m_patch_col_inflate_strides; // the strides for col inflation in the
+ // image patch
+ // Fast representation of inflation strides.
+ internal::TensorIntDivisor<Index> m_fastInputRowStride;
+ internal::TensorIntDivisor<Index> m_fastInputColStride;
+
+ Index m_otherStride;
+ Index m_colStride;
+ internal::TensorIntDivisor<Index> m_fastNumPatches;
+ internal::TensorIntDivisor<Index> m_fastColStride;
+
+ Index m_rowInputStride; // row stride in the input tensor
+ Index m_colInputStride; // col stride in the input tensor
+ Index m_patchInputStride; // patch stride in the input tensor
+
+ Index m_inputRows; // Number of rows in the input tensor
+ Index m_inputCols; // Number of cols in the input tensor
+
+ Index m_outputRows; // Number of convolution output rows
+  Index m_outputCols; // Number of convolution output columns
+
+ Index m_row_strides; // User specified row stride
+ Index m_col_strides; // User specified col stride
+
+ Index m_in_row_strides; // User specified input row stride
+ Index m_in_col_strides; // User specified input col stride
+
+ Index m_rowPaddingTop; // Row padding
+ Index m_colPaddingLeft; // Column padding
+
+ internal::TensorIntDivisor<Index> m_fastOutputRows;
+ internal::TensorIntDivisor<Index> m_fastDimZero;
+
+ const TensorEvaluator<ArgType, Device> m_impl;
+};
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device,
+ typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side,
+ int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+{
+public:
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ typedef TensorContractionInputMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ ParentMapper;
+
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Self;
+
+ typedef Self LinearMapper;
+
+ typedef typename ParentMapper::TensorEvaluatorT TensorEvaluatorT;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper,
+ Index vert_offset,
+ Index horiz_offset)
+ : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper)
+ {
+ m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper,
+ Index vert_offset,
+ Index horiz_offset)
+ : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+ m_col_offset(horiz_offset + base_mapper.m_col_offset),
+ m_base_mapper(base_mapper.m_base_mapper)
+ {
+ m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const
+ {
+ return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const
+ {
+ return m_base_mapper(i + m_depth_offset, j + m_col_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const
+ {
+ return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const
+ {
+ return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset, j + m_col_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const
+ {
+ return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex,
+ m_otherIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const
+ {
+ return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const
+ {
+ typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
+ return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
+ i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ }
+ template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { return m_base_mapper.nonStandardPatches(); }
+
+ // Max(Col|Row|Depth): compute the upper limit for the column, row and depth
+ // index respectively that fits into the peeled_k elements starting at
+ // m_depth_offset.
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const
+ {
+ const Index max_col =
+ (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride();
+ return std::min<Index>(1 + max_col, patchCols());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const
+ {
+ const Index max_row =
+ (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) /
+ fastPatchRowStride();
+ return std::min<Index>(1 + max_row, patchRows());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col, Index row) const
+ {
+ const Index max_depth = m_depth_offset + peeled_k - //
+ col * patchColStride() - //
+ row * patchRowStride();
+ return std::min<Index>(max_depth, patchDepth());
+ }
+
+ // MaxDepth uses only the remaining number of elements in the peeled_k.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements, const Index start_depth) const
+ {
+ return std::min<Index>(start_depth + num_elements, patchDepth());
+ }
+
+ // Every register matters in this code, so sometimes to prevent register
+ // spilling, instead of the variable that you would expect to see, we use
+  // another one that is guaranteed to have the same value. E.g. patch depth is
+  // always the same as input depth, and it's also the same as input row stride.
+  // Several other parameters have similar relations.
+
+ typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchRowStride() const
+ {
+ eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+ "Patch depth must be equal to patch row stride.");
+ return patchDepth();
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index patchColStride() const { return m_base_mapper.m_patch_col_stride; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const
+ {
+ eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+ "Patch depth must be equal to patch row stride.");
+ return m_base_mapper.m_fastDimZero; // patch_depth
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const
+ {
+ return m_base_mapper.m_fastPatchColStride;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const
+ {
+ const Index inputIndex = depth + baseIndex;
+ return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth, const Index baseIndex) const
+ {
+ const Index inputIndex = depth + baseIndex;
+ return m_base_mapper.m_impl.coeff(inputIndex);
+ }
+ template <typename PacketT = Packet>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+ TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const
+ {
+ const Index inputIndex = depth + baseIndex;
+ return m_base_mapper.m_impl.template partialPacket<PacketT>(inputIndex,
+ mask<PacketT>(0, num_coeffs));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool hasPadding() const
+ {
+    // TODO(ezhulenev): It does seem that for an inflated filter it's still
+ // possible to guarantee "no padding or skipping" for non-standard packing.
+ if (nonStandardPatches())
+ return true;
+
+ // Non zero padding before.
+ if (m_base_mapper.m_rowPaddingTop > 0)
+ return true;
+ if (m_base_mapper.m_colPaddingLeft > 0)
+ return true;
+
+ // Non zero padding after in rows.
+ const Index last_row = (m_base_mapper.m_outputRows - 1) * m_base_mapper.m_row_strides;
+ if (last_row + (patchRows() - 1) >= m_base_mapper.m_inputRows)
+ return true;
+
+ // Non zero padding after in cols.
+ const Index last_col = (m_base_mapper.m_outputCols - 1) * m_base_mapper.m_col_strides;
+ if (last_col + (patchCols() - 1) >= m_base_mapper.m_inputCols)
+ return true;
+
+ return false;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool padRow(const Index row) const
+ {
+ const Index r = m_rowIndex + row;
+ return r < 0 || r >= m_base_mapper.m_inputRows;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row, const Index last_row) const
+ {
+ return m_rowIndex + first_row < 0 || m_rowIndex + last_row >= m_base_mapper.m_inputRows;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool padOrSkipRow(const Index row, Index *orig_row) const
+ {
+ eigen_assert(nonStandardPatches());
+
+ const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides;
+ *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1)
+ ? input_row
+ : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0);
+
+ return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) ||
+ (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool padCol(const Index col) const
+ {
+ const Index c = m_colIndex + col;
+ return c < 0 || c >= m_base_mapper.m_inputCols;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE bool padOrSkipCol(const Index col, Index *orig_col) const
+ {
+ eigen_assert(nonStandardPatches());
+
+ const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides;
+ *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1)
+ ? input_col
+ : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0);
+
+ return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) ||
+ (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const
+ {
+ const Index r = m_rowIndex + row;
+ const Index c = m_colIndex + col;
+ return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex;
+ }
+  // Compute a base index when the original input row and column were
+  // precomputed using padOrSkipRow and padOrSkipCol. Used only for non-standard patches.
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index origBaseIndex(const Index orig_row, const Index orig_col) const
+ {
+ return orig_row * m_base_mapper.m_rowInputStride + orig_col * m_base_mapper.m_colInputStride +
+ m_otherIndex;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index rowStride() const { return m_base_mapper.m_row_strides; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index colStride() const { return m_base_mapper.m_col_strides; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index rowOffset() const
+ {
+ const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+ const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+ return patchOffset - colOffset * m_base_mapper.m_colStride;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index colOffset() const
+ {
+ const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+ const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+ return colOffset;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE Index depthOffset() const { return m_depth_offset % patchDepth(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const
+ {
+ return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
+ }
+
+private:
+ Index m_depth_offset; // First row in the input matrix
+ Index m_col_offset; // First col in the input matrix
+
+ // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
+ // indices for the first element in a patch specified by col_offset
+ // (see computeBaseIndices(...) for details).
+ Index m_rowIndex;
+ Index m_colIndex;
+ Index m_otherIndex;
+
+ const ParentMapper m_base_mapper; // Keeping a copy instead of a reference
+ // performs better in benchmarks.
+};
+
+// Arrange a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted image patches) in contiguous memory.
+//
+// Given column major input (A0 beside A1 in memory):
+// A0 B0 C0 D0 E0 F0 G0 H0 ... Z0
+// A1 B1 C1 D1 E1 F1 G1 H1 ... Z1
+// A2 B2 C2 D2 E2 F2 G2 H2 ... Z2
+// A3 B3 C3 D3 E3 F3 G3 H3 ... Z3
+// A4 B4 C4 D4 E4 F4 G4 H4 ... Z4
+// A5 B5 C5 D5 E5 F5 G5 H5 ... Z5
+// A6 B6 C6 D6 E6 F6 G6 H6 ... Z6
+// A7 B7 C7 D7 E7 F7 G7 H7 ... Z7
+// A8 ...
+// ...
+//
+// *) A, B, C, ... - patches extracted from the original input.
+// *) A0, A1, A2 ... - values from the same patch at different offsets.
+//
+// The traversal (packed rhs memory) order (B0 beside A0 in memory):
+// A0 B0 C0 D0 A1 B1 C1 D1 ...
+// E0 F0 G0 H0 E1 F1 G1 H1 ...
+// ...
+// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
+//
+// This traversal order must be the same as in default gemm_pack_rhs defined in
+// GeneralBlockPanelKernel.h.
+//
+// *) nr - number of registers along the 'n' dimension.
+// See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
+// Multiplication" paper.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device,
+ typename Scalar, typename Index, typename nocontract_t, typename contract_t,
+ int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+ int nr>
+struct gemm_pack_rhs<
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered,
+ Alignment>,
+ nr, ColMajor, false, false>
+{
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
+ typedef SubMapper DataMapper;
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols,
+ Index stride = 0, Index offset = 0) const
+ {
+ eigen_assert(stride == 0);
+ eigen_assert(offset == 0);
+ (void)stride;
+ (void)offset;
+
+ const Index packet_cols4 = (cols / 4) * 4;
+ const Index peeled_k = (depth / packet_size) * packet_size;
+ const bool non_standard_patches = rhs.nonStandardPatches();
+
+ for (Index j2 = 0; j2 < packet_cols4; j2 += 4)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k = 0;
+ if ((packet_size % 4) == 0 && !non_standard_patches)
+ {
+ // FAST PATH:
+        // Iterate over patch columns and rows if we know that a single
+        // packet does not span multiple rows or columns.
+ if ((rhs.patchDepth() % packet_size) == 0)
+ {
+ const Index start_col = rhs.colOffset();
+ const Index max_col = rhs.maxCol(peeled_k);
+
+ for (Index c = start_col; c < max_col; ++c)
+ {
+ eigen_assert(k <= peeled_k);
+
+ const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+ const Index max_row = rhs.maxRow(peeled_k, c);
+
+ const bool pad_col0 = dm0.padCol(c);
+ const bool pad_col1 = dm1.padCol(c);
+ const bool pad_col2 = dm2.padCol(c);
+ const bool pad_col3 = dm3.padCol(c);
+
+ // Check if we can squeeze reads along the `row` and `depth`
+ // dimensions (two innermost dimensions).
+ if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 && //
+ !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) && //
+ !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) && //
+ !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) && //
+ !dm3.padRow(start_row) && !dm3.padRow(max_row - 1))
+ {
+ // Compute how many elements we can squeeze read.
+ const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0;
+
+ // Upper bound for the number of elements in the depth dimension
+ // that we can squeeze read.
+ const Index squeeze_length = (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+ // Do not overshoot beyond the block size.
+ const Index max_depth = start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+ eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+ const Index idx0 = dm0.baseIndex(start_row, c);
+ const Index idx1 = dm1.baseIndex(start_row, c);
+ const Index idx2 = dm2.baseIndex(start_row, c);
+ const Index idx3 = dm3.baseIndex(start_row, c);
+
+ for (Index d = start_depth; d < max_depth; d += packet_size)
+ {
+ eigen_assert(k < peeled_k);
+ PacketBlock<Packet, 4> kernel;
+ kernel.packet[0] = rhs.packetNoPadding(d, idx0);
+ kernel.packet[1] = rhs.packetNoPadding(d, idx1);
+ kernel.packet[2] = rhs.packetNoPadding(d, idx2);
+ kernel.packet[3] = rhs.packetNoPadding(d, idx3);
+ ptranspose(kernel);
+ pstoreu(block + 0 * packet_size, kernel.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel.packet[1]);
+ pstoreu(block + 2 * packet_size, kernel.packet[2]);
+ pstoreu(block + 3 * packet_size, kernel.packet[3]);
+ block += 4 * packet_size;
+ k += packet_size;
+ }
+
+ // Go to the next column.
+ continue;
+ }
+
+ // If we can't squeeze reads, process rows one by one.
+ for (Index r = start_row; r < max_row; ++r)
+ {
+ eigen_assert(k <= peeled_k);
+
+ const bool pad0 = pad_col0 || dm0.padRow(r);
+ const bool pad1 = pad_col1 || dm1.padRow(r);
+ const bool pad2 = pad_col2 || dm2.padRow(r);
+ const bool pad3 = pad_col3 || dm3.padRow(r);
+
+ const Index idx0 = dm0.baseIndex(r, c);
+ const Index idx1 = dm1.baseIndex(r, c);
+ const Index idx2 = dm2.baseIndex(r, c);
+ const Index idx3 = dm3.baseIndex(r, c);
+
+ const Index start_depth =
+ ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+ const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+ eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+ for (Index d = start_depth; d < max_depth; d += packet_size)
+ {
+ eigen_assert(k < peeled_k);
+ PacketBlock<Packet, 4> kernel;
+ kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx0);
+ kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx1);
+ kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx2);
+ kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx3);
+ ptranspose(kernel);
+ pstoreu(block + 0 * packet_size, kernel.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel.packet[1]);
+ pstoreu(block + 2 * packet_size, kernel.packet[2]);
+ pstoreu(block + 3 * packet_size, kernel.packet[3]);
+ block += 4 * packet_size;
+ k += packet_size;
+ }
+ }
+ }
+
+ // The loop above should fill peeled_k elements.
+ eigen_assert(peeled_k == k);
+ }
+ else
+ {
+ for (; k < peeled_k; k += packet_size)
+ {
+ PacketBlock<Packet, 4> kernel;
+ kernel.packet[0] = dm0.loadPacketStandard(k);
+ kernel.packet[1] = dm1.loadPacketStandard(k);
+ kernel.packet[2] = dm2.loadPacketStandard(k);
+ kernel.packet[3] = dm3.loadPacketStandard(k);
+ ptranspose(kernel);
+ pstoreu(block + 0 * packet_size, kernel.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel.packet[1]);
+ pstoreu(block + 2 * packet_size, kernel.packet[2]);
+ pstoreu(block + 3 * packet_size, kernel.packet[3]);
+ block += 4 * packet_size;
+ }
+ }
+ }
+
+ // Copy the remaining coefficients of the column block after the peeled_k.
+ if (!rhs.nonStandardPatches())
+ {
+ for (; k < depth; k++)
+ {
+ block[0] = dm0.loadCoeffStandard(k);
+ block[1] = dm1.loadCoeffStandard(k);
+ block[2] = dm2.loadCoeffStandard(k);
+ block[3] = dm3.loadCoeffStandard(k);
+ block += 4;
+ }
+ }
+ else
+ {
+ for (; k < depth; k++)
+ {
+ block[0] = dm0(k);
+ block[1] = dm1(k);
+ block[2] = dm2(k);
+ block[3] = dm3(k);
+ block += 4;
+ }
+ }
+ }
+
+    // Copy the remaining columns one at a time (nr==1).
+ for (Index j2 = packet_cols4; j2 < cols; ++j2)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+ for (Index k = 0; k < depth; k++)
+ {
+ *block = dm0(k);
+ block += 1;
+ }
+ }
+ }
+};
+
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device,
+ typename Scalar, typename Index, typename nocontract_t, typename contract_t,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>,
+ nr, ColMajor, false, false>
+{
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
+ typedef SubMapper DataMapper;
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols,
+ Index stride = 0, Index offset = 0) const
+ {
+ eigen_assert(stride == 0);
+ eigen_assert(offset == 0);
+
+ (void)stride;
+ (void)offset;
+
+ const int packet_size = 2;
+ const Index packet_cols4 = (cols / 4) * 4;
+ const Index peeled_k = (depth / packet_size) * packet_size;
+ const bool non_standard_patches = rhs.nonStandardPatches();
+
+ for (Index j2 = 0; j2 < packet_cols4; j2 += 4)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k = 0;
+ if (!non_standard_patches)
+ {
+ // FAST PATH:
+ // Iterate over patch columns and rows if we know that a single
+        // packet does not span multiple rows or columns.
+ if ((rhs.patchDepth() % packet_size) == 0)
+ {
+ const Index start_col = rhs.colOffset();
+ const Index max_col = rhs.maxCol(peeled_k);
+
+ for (Index c = start_col; c < max_col; ++c)
+ {
+ eigen_assert(k <= peeled_k);
+
+ const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+ const Index max_row = rhs.maxRow(peeled_k, c);
+
+ const bool pad_col0 = dm0.padCol(c);
+ const bool pad_col1 = dm1.padCol(c);
+ const bool pad_col2 = dm2.padCol(c);
+ const bool pad_col3 = dm3.padCol(c);
+
+ // We can squeeze reads along the `row` and `depth` dimensions if
+ // the row stride is `1`, which means that `row` and `depth`
+ // dimensions are contiguous (two innermost dimensions).
+ if (rhs.rowStride() == 1 && //
+ !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 && //
+ !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) && //
+ !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) && //
+ !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) && //
+ !dm3.padRow(start_row) && !dm3.padRow(max_row - 1))
+ {
+ // Compute how many elements we can squeeze read.
+ const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0;
+
+ // Upper bound for the number of elements in the depth dimension
+ // that we can squeeze read.
+ const Index squeeze_length = (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+ // Do not overshoot beyond the block size.
+ const Index max_depth = start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+ eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+ const Index idx0 = dm0.baseIndex(start_row, c);
+ const Index idx1 = dm1.baseIndex(start_row, c);
+ const Index idx2 = dm2.baseIndex(start_row, c);
+ const Index idx3 = dm3.baseIndex(start_row, c);
+
+ for (Index d = start_depth; d < max_depth; d += packet_size)
+ {
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
+ kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
+ kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
+ kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ k += packet_size;
+ }
+
+ // Go to the next column.
+ continue;
+ }
+
+ // If we can't squeeze reads, process rows one by one.
+ for (Index r = start_row; r < max_row; ++r)
+ {
+ eigen_assert(k <= peeled_k);
+
+ const bool pad0 = pad_col0 || dm0.padRow(r);
+ const bool pad1 = pad_col1 || dm1.padRow(r);
+ const bool pad2 = pad_col2 || dm2.padRow(r);
+ const bool pad3 = pad_col3 || dm3.padRow(r);
+
+ const Index idx0 = dm0.baseIndex(r, c);
+ const Index idx1 = dm1.baseIndex(r, c);
+ const Index idx2 = dm2.baseIndex(r, c);
+ const Index idx3 = dm3.baseIndex(r, c);
+
+ const Index start_depth =
+ ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+ const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+ eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+ for (Index d = start_depth; d < max_depth; d += packet_size)
+ {
+ eigen_assert(k < peeled_k);
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx0);
+ kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx1);
+ kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx2);
+ kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx3);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ k += packet_size;
+ }
+ }
+ }
+
+ // The loop above should fill peeled_k elements.
+ eigen_assert(peeled_k == k);
+ }
+ else
+ {
+ // Packet can span multiple rows or columns, so we have to go
+        // through the slower "standard" path.
+ for (; k < peeled_k; k += packet_size)
+ {
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = dm0.loadPacketStandard(k);
+ kernel0.packet[1] = dm1.loadPacketStandard(k);
+ kernel1.packet[0] = dm2.loadPacketStandard(k);
+ kernel1.packet[1] = dm3.loadPacketStandard(k);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ }
+ }
+ }
+
+ // Copy the remaining coefficients of the column block after the peeled_k.
+ if (!non_standard_patches)
+ {
+ for (; k < depth; k++)
+ {
+ block[0] = dm0.loadCoeffStandard(k);
+ block[1] = dm1.loadCoeffStandard(k);
+ block[2] = dm2.loadCoeffStandard(k);
+ block[3] = dm3.loadCoeffStandard(k);
+ block += 4;
+ }
+ }
+ else
+ {
+ for (; k < depth; k++)
+ {
+ block[0] = dm0(k);
+ block[1] = dm1(k);
+ block[2] = dm2(k);
+ block[3] = dm3(k);
+ block += 4;
+ }
+ }
+ }
+
+ // Copy the remaining columns one at a time (nr==1).
+ for (Index j2 = packet_cols4; j2 < cols; ++j2)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+ for (Index k = 0; k < depth; k++)
+ {
+ *block = dm0(k);
+ block += 1;
+ }
+ }
+ }
+};
+
+// Special case for non-vectorized types such as float16.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device,
+ typename Scalar, typename Index, typename nocontract_t, typename contract_t,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>,
+ nr, ColMajor, false, false>
+{
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
+ Device>,
+ nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
+ typedef SubMapper DataMapper;
+
+ EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols,
+ Index stride = 0, Index offset = 0) const
+ {
+ eigen_assert(stride == 0);
+ eigen_assert(offset == 0);
+
+ (void)offset;
+ (void)stride;
+
+ const Index packet_cols4 = (cols / 4) * 4;
+
+ for (Index j2 = 0; j2 < packet_cols4; j2 += 4)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ if (!rhs.nonStandardPatches())
+ {
+ for (Index k = 0; k < depth; k++)
+ {
+ block[0] = dm0.loadCoeffStandard(k);
+ block[1] = dm1.loadCoeffStandard(k);
+ block[2] = dm2.loadCoeffStandard(k);
+ block[3] = dm3.loadCoeffStandard(k);
+ block += 4;
+ }
+ }
+ else
+ {
+ for (Index k = 0; k < depth; k++)
+ {
+ block[0] = dm0(k);
+ block[1] = dm1(k);
+ block[2] = dm2(k);
+ block[3] = dm3(k);
+ block += 4;
+ }
+ }
+ }
+
+ // Copy the remaining columns one at a time (nr==1).
+ for (Index j2 = packet_cols4; j2 < cols; ++j2)
+ {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+ for (Index k = 0; k < depth; k++)
+ {
+ *block = dm0(k);
+ block += 1;
+ }
+ }
+ }
+};
+} // end namespace internal
+
+/** SpatialConvolution
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
+ * (channels, height, width, and optionally others).
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * kernel_height, kernel_width).
+ * The input and the kernel must both be in col-major layout. The result will
+ * also be in col-major layout.
+ *
+ * If col_in_stride or row_in_stride > 1, the convolution is applied with holes
+ * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
+ * pixels.
+ *
+ * If padding_top, padding_bottom, padding_left, or padding_right is specified,
+ * then those paddings will be used to pad the input, and padding_type must be
+ * PADDING_VALID.
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
+ * input. The dimensions of the result will be filters, height, width (and
+ * others if applicable).
+ *
+ * It is possible to swap the order of the width and height dimensions provided
+ * that the same order is used in the input, the kernel, and the output.
+ *
+ * It is also possible to add an output kernel to the contraction; the output
+ * kernel is called by Eigen when it "finalizes" a block of the output tensor.
+ *
+ */
+template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional<
+ internal::traits<Input>::Layout == ColMajor,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index,
+ internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
+ const OutputKernel>>,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index,
+ internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel>,
+ const OutputKernel>>>::type
+SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1,
+ const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME,
+ const Index row_in_stride = 1, const Index col_in_stride = 1,
+ const OutputKernel &output_kernel = OutputKernel(), Index padding_top = 0,
+ Index padding_bottom = 0, Index padding_left = 0, Index padding_right = 0)
+{
+ typedef typename internal::traits<Input>::Index TensorIndex;
+ TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions,
+ internal::traits<Input>::Layout, TensorIndex>>
+ in(input);
+ TensorRef<
+ Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions,
+ internal::traits<Kernel>::Layout, TensorIndex>>
+ kern(kernel);
+
+ EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
+ YOU_MADE_A_PROGRAMMING_MISTAKE)
+ const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
+
+ const int NumDims = internal::traits<Input>::NumDimensions;
+
+ // Number of filters to apply. This is the same as the output depth of the
+ // result
+ const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
+ // Number of channels. This is the same as the input depth.
+ const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
+ const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
+ const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
+
+ const Index kernelRowsEff = kernelRows + (kernelRows - 1) * (row_in_stride - 1);
+ const Index kernelColsEff = kernelCols + (kernelCols - 1) * (col_in_stride - 1);
+
+ array<IndexPair<TensorIndex>, 1> contract_dims;
+ contract_dims[0] = IndexPair<TensorIndex>(1, 0);
+
+ const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
+ const TensorIndex InputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+ const bool padding_explicit = (padding_top || padding_bottom || padding_left || padding_right);
+
+ TensorIndex out_height;
+ TensorIndex out_width;
+ switch (padding_type)
+ {
+ case PADDING_VALID:
+ {
+ const TensorIndex InputRowsEff = InputRows + padding_top + padding_bottom;
+ const TensorIndex InputColsEff = InputCols + padding_left + padding_right;
+ out_height = divup(InputRowsEff - kernelRowsEff + 1, row_stride);
+ out_width = divup(InputColsEff - kernelColsEff + 1, col_stride);
+ break;
+ }
+ case PADDING_SAME:
+ {
+ eigen_assert(!padding_explicit);
+ out_height = divup(InputRows, row_stride);
+ out_width = divup(InputCols, col_stride);
+ break;
+ }
+ default:
+ {
+ // Initialize unused variables to avoid a compiler warning
+ out_height = 0;
+ out_width = 0;
+ eigen_assert(false && "unexpected padding");
+ }
+ }
+
+ // Molds the output of the patch extraction code into a 2d tensor:
+ // - the first dimension (dims[0]): the patch values to be multiplied with the
+ // kernels
+ // - the second dimension (dims[1]): everything else
+ DSizes<TensorIndex, 2> pre_contract_dims;
+ if (isColMajor)
+ {
+ pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+ pre_contract_dims[1] = out_height * out_width;
+ for (int i = 3; i < NumDims; ++i)
+ {
+ pre_contract_dims[1] *= in.dimension(i);
+ }
+ }
+ else
+ {
+ pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+ pre_contract_dims[0] = out_height * out_width;
+ for (int i = 0; i < NumDims - 3; ++i)
+ {
+ pre_contract_dims[0] *= in.dimension(i);
+ }
+ }
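+  // For example, with a col-major float input of shape
+  // (channels = 3, height = 32, width = 32, batch = 4), a 3x3 kernel, stride 1
+  // and SAME padding, pre_contract_dims is {3 * 3 * 3, 32 * 32 * 4}, i.e.
+  // {27, 4096}.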
+
+  // Molds the output of the contraction into the shape expected by the user
+ // (assuming this is ColMajor):
+ // - 1st dim: kernel filters
+ // - 2nd dim: output height
+ // - 3rd dim: output width
+ // - 4th dim and beyond: everything else including batch size
+ DSizes<TensorIndex, NumDims> post_contract_dims;
+ if (isColMajor)
+ {
+ post_contract_dims[0] = kernelFilters;
+ post_contract_dims[1] = out_height;
+ post_contract_dims[2] = out_width;
+ for (int i = 3; i < NumDims; ++i)
+ {
+ post_contract_dims[i] = in.dimension(i);
+ }
+ }
+ else
+ {
+ post_contract_dims[NumDims - 1] = kernelFilters;
+ post_contract_dims[NumDims - 2] = out_height;
+ post_contract_dims[NumDims - 3] = out_width;
+ for (int i = 0; i < NumDims - 3; ++i)
+ {
+ post_contract_dims[i] = in.dimension(i);
+ }
+ }
+
+ DSizes<TensorIndex, 2> kernel_dims;
+ if (isColMajor)
+ {
+ kernel_dims[0] = kernelFilters;
+ kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+ }
+ else
+ {
+ kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+ kernel_dims[1] = kernelFilters;
+ }
+ if (padding_explicit)
+ {
+ return choose(
+ Cond<internal::traits<Input>::Layout == ColMajor>(),
+ kernel.reshape(kernel_dims)
+ .contract(input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+ row_in_stride, col_in_stride,
+ /*row_inflate_stride=*/1,
+ /*col_inflate_stride=*/1, padding_top,
+ padding_bottom, padding_left, padding_right,
+ /*padding_value=*/0)
+ .reshape(pre_contract_dims),
+ contract_dims, output_kernel)
+ .reshape(post_contract_dims),
+ input
+ .extract_image_patches(
+ kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride,
+ /*row_inflate_stride=*/1,
+ /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right,
+ /*padding_value=*/0)
+ .reshape(pre_contract_dims)
+ .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+ .reshape(post_contract_dims));
+ }
+ else
+ {
+ return choose(
+ Cond<internal::traits<Input>::Layout == ColMajor>(),
+ kernel.reshape(kernel_dims)
+ .contract(input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+ row_in_stride, col_in_stride, padding_type)
+ .reshape(pre_contract_dims),
+ contract_dims, output_kernel)
+ .reshape(post_contract_dims),
+ input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride,
+ col_in_stride, padding_type)
+ .reshape(pre_contract_dims)
+ .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+ .reshape(post_contract_dims));
+ }
+}
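+
+// Minimal usage sketch; the shapes and the ExampleSpatialConvolutionUsage name
+// below are illustrative only. A col-major float input of shape
+// (channels, rows, cols, batch) is convolved with 8 filters of size 3x3 using
+// unit strides and SAME padding, which preserves the spatial dimensions.
+inline void ExampleSpatialConvolutionUsage()
+{
+  Eigen::Tensor<float, 4> input(3, 32, 32, 1);  // (channels, rows, cols, batch)
+  Eigen::Tensor<float, 4> kernel(8, 3, 3, 3);   // (filters, channels, kH, kW)
+  input.setRandom();
+  kernel.setRandom();
+
+  Eigen::Tensor<float, 4> output(8, 32, 32, 1); // (filters, rows, cols, batch)
+  output = SpatialConvolution(input, kernel, /*row_stride=*/1,
+                              /*col_stride=*/1, PADDING_SAME);
+}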
+
+} // end namespace Eigen
+
+#endif // __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__
diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h
new file mode 100644
index 000000000..c6f1e2ee7
--- /dev/null
+++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__
+#define __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__
+
+//#define EIGEN_USE_CUSTOM_THREAD_POOL
+#define EIGEN_USE_THREADS
+#include "unsupported/Eigen/CXX11/Tensor"
+
+// Note the following header is used in both TF and TFLite. In particular, it's
+// used for float TFLite Conv2D.
+#include "cker/eigen/eigen_spatial_convolutions-inl.h"
+
+#endif // __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__
diff --git a/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h b/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h
new file mode 100644
index 000000000..87697e240
--- /dev/null
+++ b/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This is essentially unsupported/CXX11/Eigen/Tensor.h
+// TODO(petewarden) - move this to a common location in Eigen itself.
+
+// clang-format off
+
+
+#ifndef __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__
+#define __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__
+
+
+#include "Eigen/Core"
+
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+
+
+
+
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#include <windows.h>
+#else
+#include <stdint.h>
+#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
+#include <random>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef EIGEN_USE_THREADS
+#include "unsupported/Eigen/CXX11/ThreadPool"
+#endif
+
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "unsupported/Eigen/SpecialFunctions"
+#include "unsupported/Eigen/CXX11/src/util/CXX11Meta.h"
+#include "unsupported/Eigen/CXX11/src/util/MaxSizeVector.h"
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+
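+// Narrow the contraction dispatch to the single format used here (both inner
+// dimensions contiguous, RHS not reordered); any other combination asserts at
+// runtime instead of instantiating additional kernel variants.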
+#undef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+ if (this->m_lhs_inner_dim_contiguous && \
+ this->m_rhs_inner_dim_contiguous && \
+ !this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, true, false, ALIGNMENT> ARGS; \
+ } else { \
+ eigen_assert(false && "Unsupported contraction formats"); \
+ }
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+#endif // __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__
diff --git a/compute/cker/include/cker/gemmlowp/GEMMSupport.h b/compute/cker/include/cker/gemmlowp/GEMMSupport.h
new file mode 100644
index 000000000..76486eded
--- /dev/null
+++ b/compute/cker/include/cker/gemmlowp/GEMMSupport.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__
+#define __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__
+
+#include <public/gemmlowp.h>
+
+#include <memory>
+#include <thread>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace gemm_support
+{
+
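+// Process-wide singleton wrapper around gemmlowp::GemmContext. The thread pool
+// size defaults to half of std::thread::hardware_concurrency(), falling back
+// to four threads when that value is reported as zero.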
+struct GemmContext
+{
+ std::unique_ptr<gemmlowp::GemmContext> gemm_context;
+ constexpr static int default_num_threadpool_threads = 4;
+
+ GemmContext()
+ {
+ int num_threads = std::thread::hardware_concurrency() / 2;
+ if (num_threads == 0)
+ {
+ num_threads = default_num_threadpool_threads;
+ }
+
+ gemm_context.reset(new gemmlowp::GemmContext());
+ gemm_context->set_max_num_threads(num_threads);
+ }
+
+ static inline GemmContext &GetGemmLowpContext()
+ {
+ static GemmContext instance;
+ return instance;
+ }
+};
+
+inline gemmlowp::GemmContext *GetGemmLowpContext()
+{
+ auto &ctx = GemmContext::GetGemmLowpContext();
+ return ctx.gemm_context.get();
+}
+
+} // namespace gemm_support
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__
diff --git a/compute/cker/include/cker/neon/neon_check.h b/compute/cker/include/cker/neon/neon_check.h
new file mode 100644
index 000000000..116f01bb7
--- /dev/null
+++ b/compute/cker/include/cker/neon/neon_check.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_NEON_CHECK_H__
+#define __NNFW_CKER_NEON_CHECK_H__
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+// Disable X86_NEON
+// #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
+#if 0
+#define USE_NEON
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wsequence-point"
+#include "NEON_2_SSE.h"
+#pragma GCC diagnostic pop
+#endif
+
+// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is
+// defined, PortableSomeFunc(args) otherwise.
+#ifdef USE_NEON
+// Always use Neon code
+#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)
+
+#else
+// No NEON available: Use Portable code
+#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
+
+#endif // defined(USE_NEON)
+
+#endif // __NNFW_CKER_NEON_CHECK_H__
diff --git a/compute/cker/include/cker/operation/AddN.h b/compute/cker/include/cker/operation/AddN.h
new file mode 100644
index 000000000..1704da641
--- /dev/null
+++ b/compute/cker/include/cker/operation/AddN.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ADDN_H__
+#define __NNFW_CKER_ADDN_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+void AddN(const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data)
+{
+ const size_t size = input_shape.FlatSize();
+ for (size_t i = 0; i < size; ++i)
+ {
+ T x = 0;
+ for (size_t j = 0; j < num_inputs; ++j)
+ {
+ x += input_data[j][i];
+ }
+ output_data[i] = x;
+ }
+}
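+
+// Minimal usage sketch (the buffers are illustrative, and Shape is assumed to
+// be constructible from an initializer list as in TFLite's RuntimeShape):
+//
+//   const float a[] = {1.f, 2.f, 3.f, 4.f};
+//   const float b[] = {10.f, 20.f, 30.f, 40.f};
+//   const float *inputs[] = {a, b};
+//   float out[4];
+//   AddN(Shape{4}, /*num_inputs=*/2, inputs, out);  // out == {11, 22, 33, 44}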
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ADDN_H__
diff --git a/compute/cker/include/cker/operation/ArgMinMax.h b/compute/cker/include/cker/operation/ArgMinMax.h
new file mode 100644
index 000000000..f7a06d74b
--- /dev/null
+++ b/compute/cker/include/cker/operation/ArgMinMax.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ARGMINMAX_H__
+#define __NNFW_CKER_ARGMINMAX_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T1, typename T2, typename Cmp>
+void ArgMinMax(const Shape &input1_shape, const T1 *input1_data, const Shape &output_shape,
+ T2 *output_data, int32_t axis, const Cmp &cmp)
+{
+ UNUSED_RELEASE(output_shape);
+ assert(input1_shape.DimensionsCount() > 0);
+ assert(input1_shape.DimensionsCount() - 1 == output_shape.DimensionsCount());
+ if (axis < 0)
+ {
+ axis += input1_shape.DimensionsCount();
+ }
+ const int axis_size = input1_shape.Dims(axis);
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ assert(input1_shape.Dims(i) == output_shape.Dims(i));
+ outer_size *= input1_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ const int dims_count = input1_shape.DimensionsCount();
+ for (int i = axis + 1; i < dims_count; ++i)
+ {
+ assert(input1_shape.Dims(i) == output_shape.Dims(i - 1));
+ inner_size *= input1_shape.Dims(i);
+ }
+ for (int outer = 0; outer < outer_size; ++outer)
+ {
+ for (int inner = 0; inner < inner_size; ++inner)
+ {
+ auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
+ T2 min_max_index = 0;
+ for (int i = 1; i < axis_size; ++i)
+ {
+ const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner];
+ if (cmp(curr_value, min_max_value))
+ {
+ min_max_value = curr_value;
+ min_max_index = static_cast<T2>(i);
+ }
+ }
+ output_data[outer * inner_size + inner] = min_max_index;
+ }
+ }
+}
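+
+// Minimal usage sketch of an arg-max over the last axis (Shape is assumed to
+// be constructible from an initializer list as in TFLite's RuntimeShape):
+//
+//   const float in[] = {1.f, 5.f, 2.f, 7.f, 0.f, 3.f};  // logical shape 2x3
+//   int32_t out[2];
+//   ArgMinMax(Shape{2, 3}, in, Shape{2}, out, /*axis=*/1, std::greater<float>());
+//   // out == {1, 0}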
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ARGMINMAX_H__
diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h
new file mode 100644
index 000000000..6149cafa7
--- /dev/null
+++ b/compute/cker/include/cker/operation/AveragePool.h
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_AVERAGE_POOL_H__
+#define __NNFW_CKER_AVERAGE_POOL_H__
+
+#include "cker/neon/neon_check.h"
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+// TODO Apply NEON to this function if it is faster
+template <typename T>
+void AveragePool(const PoolParams &, const Shape &, const T *, const Shape &, T *)
+{
+  static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
+                "cker::AveragePool : This function supports only integer or floating point");
+ throw std::runtime_error("cker::AveragePool : Unsupported data type");
+}
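+
+// Minimal usage sketch for the float specialization below (PoolParams is
+// assumed to be value-initializable; only the fields read here are set):
+// 2x2 average pooling, stride 2, no padding, on an NHWC (1, 4, 4, 1) input.
+//
+//   PoolParams params{};
+//   params.filter_height = 2;  params.filter_width = 2;
+//   params.stride_height = 2;  params.stride_width = 2;
+//   params.padding_values.height = 0;  params.padding_values.width = 0;
+//   params.float_activation_min = std::numeric_limits<float>::lowest();
+//   params.float_activation_max = std::numeric_limits<float>::max();
+//   AveragePool<float>(params, Shape{1, 4, 4, 1}, in, Shape{1, 2, 2, 1}, out);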
+
+template <>
+void AveragePool<float>(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ // TODO(benoitjacob) make this a proper reference impl without Eigen!
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise sum
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+ // Divide the output by the actual number of elements being averaged over
+ assert(out_count.minCoeff() > 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+inline void AveragePool16(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large, and we would
+  // otherwise need arbitrarily large temporary storage, we divide the work up
+  // into depth tranches just within the batch loop.
+ static constexpr int kPoolingAccTrancheSize = 256;
+
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ uint16_t acc[kPoolingAccTrancheSize];
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ // We proceed through the depth in tranches (see comment above). The
+ // depth_base is the depth at the beginning of the tranche. The
+ // tranche_depth is the depth dimension of the tranche.
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
+ {
+ const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ const int filter_count =
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ memset(acc, 0, tranche_depth * sizeof(acc[0]));
+ const uint8_t *input_ptr =
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ for (int fy = filter_y_start; fy < filter_y_end; fy++)
+ {
+ const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
+ for (int fx = filter_x_start; fx < filter_x_end; fx++)
+ {
+ const uint8_t *input_channel_ptr = input_row_ptr;
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ uint16x8_t acc_reg[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+ }
+ uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+ input_channel_ptr += 16;
+ acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+ acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+ }
+ }
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint16x8_t acc_reg = vld1q_u16(acc + channel);
+ uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+ input_channel_ptr += 8;
+ acc_reg = vaddw_u8(acc_reg, input_reg);
+ vst1q_u16(acc + channel, acc_reg);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ acc[channel] += *input_channel_ptr++;
+ }
+ input_row_ptr += depth;
+ }
+ }
+ uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
+ int channel = 0;
+#ifdef USE_NEON
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
+ if (filter_count == FILTER_COUNT) \
+ { \
+ for (; channel <= tranche_depth - 8; channel += 8) \
+ { \
+ uint16_t buf[8]; \
+ for (int i = 0; i < 8; i++) \
+ { \
+ buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
+ } \
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
+ buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
+ buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
+ vst1_u8(output_ptr + channel, buf8); \
+ } \
+ }
+ AVGPOOL_DIVIDING_BY(9)
+ AVGPOOL_DIVIDING_BY(15)
+#undef AVGPOOL_DIVIDING_BY
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint16_t buf[8];
+ for (int i = 0; i < 8; i++)
+ {
+ buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+ }
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+ buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+ buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+ vst1_u8(output_ptr + channel, buf8);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ uint8_t a = (acc[channel] + filter_count / 2) / filter_count;
+ a = std::max<uint16_t>(a, params.quantized_activation_min);
+ a = std::min<uint16_t>(a, params.quantized_activation_max);
+ output_ptr[channel] = static_cast<uint8_t>(a);
+ }
+ }
+ }
+ }
+ }
+}
+
+inline void AveragePool32(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+ // use an inner loop down the depth. Since depths can be large and hence we
+ // would need arbitrarily large temporary storage, we divide the work up into
+ // depth tranches just within the batch loop.
+ static constexpr int kPoolingAccTrancheSize = 256;
+
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ uint32_t acc[kPoolingAccTrancheSize];
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ // We proceed through the depth in tranches (see comment above). The
+ // depth_base is the depth at the beginning of the tranche. The
+ // tranche_depth is the depth dimension of the tranche.
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
+ {
+ const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ const int filter_count =
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ memset(acc, 0, tranche_depth * sizeof(acc[0]));
+ const uint8_t *input_ptr =
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ for (int fy = filter_y_start; fy < filter_y_end; fy++)
+ {
+ const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
+ for (int fx = filter_x_start; fx < filter_x_end; fx++)
+ {
+ const uint8_t *input_channel_ptr = input_row_ptr;
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ uint16x4_t acc_reg[4];
+ uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+ input_channel_ptr += 16;
+ acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg)));
+ acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg)));
+ acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg)));
+ acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg)));
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_u32(acc + channel + 4 * i,
+ vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint16x4_t acc_reg[2];
+ uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr));
+ input_channel_ptr += 8;
+ acc_reg[0] = vget_low_u16(input_reg);
+ acc_reg[1] = vget_high_u16(input_reg);
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_u32(acc + channel + 4 * i,
+ vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ acc[channel] += *input_channel_ptr++;
+ }
+ input_row_ptr += depth;
+ }
+ }
+ uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
+ int channel = 0;
+#ifdef USE_NEON
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
+ if (filter_count == FILTER_COUNT) \
+ { \
+ for (; channel <= tranche_depth - 8; channel += 8) \
+ { \
+ uint16_t buf[8]; \
+ for (int i = 0; i < 8; i++) \
+ { \
+ buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
+ } \
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
+ buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
+ buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
+ vst1_u8(output_ptr + channel, buf8); \
+ } \
+ }
+ AVGPOOL_DIVIDING_BY(9)
+ AVGPOOL_DIVIDING_BY(15)
+#undef AVGPOOL_DIVIDING_BY
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint16_t buf[8];
+ for (int i = 0; i < 8; i++)
+ {
+ buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+ }
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+ buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+ buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+ vst1_u8(output_ptr + channel, buf8);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ uint16_t a = (acc[channel] + filter_count / 2) / filter_count;
+ a = std::max<uint16_t>(a, params.quantized_activation_min);
+ a = std::min<uint16_t>(a, params.quantized_activation_max);
+ output_ptr[channel] = static_cast<uint8_t>(a);
+ }
+ }
+ }
+ }
+ }
+}
+
+template <>
+void AveragePool<uint8_t>(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ if (params.filter_height * params.filter_width > 16 * 16)
+ {
+ AveragePool32(params, input_shape, input_data, output_shape, output_data);
+ }
+ else
+ {
+ AveragePool16(params, input_shape, input_data, output_shape, output_data);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_AVERAGE_POOL_H__
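As a rough sketch of why the dispatch above switches accumulators at a 16x16 window: each uint8_t sample is at most 255, so a window of N elements sums to at most 255*N, and 255*256 = 65280 still fits the uint16_t accumulators of AveragePool16, while anything larger needs the uint32_t accumulators of AveragePool32. The standalone snippet below (illustrative name, not part of the imported source) restates the per-cell scalar average with the same round-to-nearest division used in the tail loops.

#include <algorithm>
#include <cstdint>

// Scalar average of one pooling window, mirroring (acc + count / 2) / count above.
uint8_t AverageOfWindow(const uint8_t *window, int count, uint8_t act_min, uint8_t act_max)
{
  uint32_t acc = 0;
  for (int i = 0; i < count; ++i)
    acc += window[i]; // at most 255 * count; fits a uint16_t only when count <= 256
  const uint32_t avg = (acc + count / 2) / count; // round-to-nearest integer division
  return static_cast<uint8_t>(std::min<uint32_t>(act_max, std::max<uint32_t>(act_min, avg)));
}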
diff --git a/compute/cker/include/cker/operation/BatchMatMul.h b/compute/cker/include/cker/operation/BatchMatMul.h
new file mode 100644
index 000000000..18070982a
--- /dev/null
+++ b/compute/cker/include/cker/operation/BatchMatMul.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BATCH_MATMUL_H__
+#define __NNFW_CKER_BATCH_MATMUL_H__
+
+#include "Transpose.h"
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/operation/reference/BatchMatMul.h"
+
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+
+class BatchMatMul
+{
+public:
+ BatchMatMul()
+ {
+ // DO NOTHING
+ }
+
+ /**
+ * @brief Prepare temporary area for calculation
+ */
+ void prepare(const Shape &lhs_shape, const Shape &rhs_shape, bool adj_x, bool adj_y)
+ {
+ if (adj_x)
+ {
+ int32_t rank = lhs_shape.DimensionsCount();
+ _temp_lhs_shape.Resize(rank);
+
+ for (int32_t i = 0; i < rank - 2; i++)
+ {
+ _temp_lhs_shape.SetDim(i, lhs_shape.Dims(i));
+ }
+ _temp_lhs_shape.SetDim(rank - 2, lhs_shape.Dims(rank - 1));
+ _temp_lhs_shape.SetDim(rank - 1, lhs_shape.Dims(rank - 2));
+
+ _temp_lhs.resize(_temp_lhs_shape.FlatSize());
+ }
+
+ if (!adj_y)
+ {
+ int32_t rank = rhs_shape.DimensionsCount();
+ _temp_rhs_shape.Resize(rank);
+
+ for (int32_t i = 0; i < rank - 2; i++)
+ {
+ _temp_rhs_shape.SetDim(i, rhs_shape.Dims(i));
+ }
+ _temp_rhs_shape.SetDim(rank - 2, rhs_shape.Dims(rank - 1));
+ _temp_rhs_shape.SetDim(rank - 1, rhs_shape.Dims(rank - 2));
+
+ _temp_rhs.resize(_temp_rhs_shape.FlatSize());
+ }
+ }
+
+ void operator()(const Shape &lhs_shape, const float *lhs_data, const Shape &rhs_shape,
+ const float *rhs_data, bool adj_x, bool adj_y, const Shape &output_shape,
+ float *output_data)
+ {
+ // Assume lhs and rhs are not constant

+ // TODO Handle constant input
+
+ if (!adj_y)
+ {
+ transposeRowsCols(rhs_shape, rhs_data, _temp_rhs_shape, _temp_rhs.data());
+ }
+
+ if (adj_x)
+ {
+ transposeRowsCols(lhs_shape, lhs_data, _temp_lhs_shape, _temp_lhs.data());
+ }
+
+ Shape new_lhs_shape = adj_x ? lhs_shape : swapRowColDims(lhs_shape);
+ Shape new_rhs_shape = adj_y ? rhs_shape : swapRowColDims(rhs_shape);
+ const float *new_lhs_data = adj_x ? _temp_lhs.data() : lhs_data;
+ const float *new_rhs_data = adj_y ? rhs_data : _temp_rhs.data();
+
+ // Note we pass RHS args first, LHS args second
+ // Check that the contracting dimensions of lhs and rhs are equal
+ assert(Shape::ExtendedShape(5, new_rhs_shape).Dims(4) ==
+ Shape::ExtendedShape(5, new_lhs_shape).Dims(3));
+ reference::BatchMatMul(new_rhs_shape, new_rhs_data, new_lhs_shape, new_lhs_data, output_shape,
+ output_data);
+ }
+
+private:
+ Shape swapRowColDims(const Shape &shape)
+ {
+ Shape swapped_shape(shape);
+ const uint32_t dims = shape.DimensionsCount();
+ swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1));
+ swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2));
+
+ return swapped_shape;
+ }
+
+ void transposeRowsCols(const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+ {
+ TransposeParams params;
+ int rank = input_shape.DimensionsCount();
+ params.perm_count = rank;
+ for (int i = 0; i < 2; i++)
+ {
+ params.perm[i] = i;
+ }
+ params.perm[rank - 2] = rank - 1;
+ params.perm[rank - 1] = rank - 2;
+
+ Transpose<float>(params, input_shape, input_data, output_shape, output_data);
+ }
+
+private:
+ std::vector<float> _temp_lhs;
+ Shape _temp_lhs_shape;
+ std::vector<float> _temp_rhs;
+ Shape _temp_rhs_shape;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BATCH_MATMUL_H__
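For reference, the semantics the class above implements (after its internal transposes) amount to out[b] = op(lhs[b], adj_x) * op(rhs[b], adj_y), where op transposes its argument when the flag is set. Below is a naive standalone restatement, assuming equal batch dimensions and row-major [batch, rows, cols] buffers; the function name is illustrative and this is not the reference::BatchMatMul used above.

#include <vector>

// out is [batch, m, n]. With adj_x set, lhs is stored as [batch, k, m];
// otherwise [batch, m, k]. With adj_y set, rhs is stored as [batch, n, k];
// otherwise [batch, k, n].
void NaiveBatchMatMul(int batch, int m, int k, int n,
                      const std::vector<float> &lhs, bool adj_x,
                      const std::vector<float> &rhs, bool adj_y,
                      std::vector<float> &out)
{
  for (int b = 0; b < batch; ++b)
    for (int i = 0; i < m; ++i)
      for (int j = 0; j < n; ++j)
      {
        float acc = 0.f;
        for (int p = 0; p < k; ++p)
        {
          const float l = adj_x ? lhs[(b * k + p) * m + i] : lhs[(b * m + i) * k + p];
          const float r = adj_y ? rhs[(b * n + j) * k + p] : rhs[(b * k + p) * n + j];
          acc += l * r;
        }
        out[(b * m + i) * n + j] = acc;
      }
}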
diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h
new file mode 100644
index 000000000..e33b2fba5
--- /dev/null
+++ b/compute/cker/include/cker/operation/BatchToSpaceND.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+#define __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+
+#include "cker/Shape.h"
+
+#define UNUSED(x) ((void)(x))
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Helper methods for BatchToSpaceND.
+// `spatial_index_dim` specifies post-crop offset index in this spatial
+// dimension, i.e. spatial offset introduced by flattening batch to spatial
+// dimension minus the crop size at beginning. `block_shape_dim` is the block
+// size in current dimension. `input_dim` and `output_dim` are input and output
+// size of BatchToSpaceND operation in current dimension.
+// Output start index is inclusive and end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim,
+ int *start_index, int *end_index)
+{
+ // (*start_index) * block_shape_dim is effectively rounded up to the next
+ // multiple of block_shape_dim by the integer division.
+ *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
+ // end_index is exclusive).
+ *end_index =
+ std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+}
+
+template <typename T>
+inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1_data,
+ const int32_t *block_shape_data, const int32_t *crops_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ auto input_dim = unextended_input1_shape.DimensionsCount();
+ auto output_dim = unextended_output_shape.DimensionsCount();
+
+ assert(input_dim == 3 || input_dim == 4);
+ assert(input_dim == output_dim);
+
+ UNUSED(input_dim);
+ UNUSED(output_dim);
+
+ // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
+ auto extend_shape = [](const Shape &shape) {
+ if (shape.DimensionsCount() == 4)
+ {
+ return shape;
+ }
+ Shape new_shape(4, 1);
+ new_shape.SetDim(0, shape.Dims(0));
+ new_shape.SetDim(1, shape.Dims(1));
+ new_shape.SetDim(3, shape.Dims(2));
+ return new_shape;
+ };
+ const Shape input1_shape = extend_shape(unextended_input1_shape);
+ const Shape output_shape = extend_shape(unextended_output_shape);
+
+ const int32_t output_width = output_shape.Dims(2);
+ const int32_t output_height = output_shape.Dims(1);
+ const int32_t output_batch_size = output_shape.Dims(0);
+
+ const int32_t depth = input1_shape.Dims(3);
+ const int32_t input_width = input1_shape.Dims(2);
+ const int32_t input_height = input1_shape.Dims(1);
+ const int32_t input_batch_size = input1_shape.Dims(0);
+
+ const int32_t block_shape_height = block_shape_data[0];
+ const int32_t block_shape_width = block_shape_data[1];
+
+ const int32_t crops_top = crops_data[0];
+ const int32_t crops_left = crops_data[2];
+
+ for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
+ {
+ const int out_batch = in_batch % output_batch_size;
+ const int spatial_offset = in_batch / output_batch_size;
+
+ int in_h_start = 0;
+ int in_h_end = 0;
+ // GetIndexRange ensures start and end indices are in [0, output_height).
+ GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height,
+ output_height, &in_h_start, &in_h_end);
+
+ for (int in_h = in_h_start; in_h < in_h_end; ++in_h)
+ {
+ const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
+ assert(out_h >= 0);
+ assert(out_h < output_height);
+
+ int in_w_start = 0;
+ int in_w_end = 0;
+ // GetIndexRange ensures start and end indices are in [0, output_width).
+ GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width,
+ output_width, &in_w_start, &in_w_end);
+
+ for (int in_w = in_w_start; in_w < in_w_end; ++in_w)
+ {
+ const int out_w =
+ in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
+ assert(out_w >= 0);
+ assert(out_w < output_width);
+ T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
+ const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BATCH_TO_SPACE_ND_H__
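A small self-contained check of the GetIndexRange contract, using hypothetical numbers rather than anything from the imported source: every input index inside the returned range must map to an output index within [0, output_dim) once the spatial offset is applied.

#include <algorithm>
#include <cassert>

// Standalone copy of the index-range logic above; the name is illustrative.
void IndexRange(int spatial_index_dim, int block, int input_dim, int output_dim,
                int *start, int *end)
{
  *start = std::max(0, (-spatial_index_dim + block - 1) / block);
  *end = std::min(input_dim, (output_dim - spatial_index_dim + block - 1) / block);
}

int main()
{
  // Example: block size 2, spatial_index_dim = -1 (e.g. a crop of 1 at the start),
  // input spatial dim 3, output spatial dim 5.
  int start = 0, end = 0;
  IndexRange(-1, 2, 3, 5, &start, &end);
  for (int in = start; in < end; ++in)
  {
    const int out = in * 2 + (-1);
    assert(out >= 0 && out < 5); // in = 0 would give out = -1 and is correctly excluded
  }
  return 0;
}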
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
new file mode 100644
index 000000000..d9917a9da
--- /dev/null
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+
+#include <functional>
+#include "cker/operation/optimized/BinaryArithmeticOps.h"
+#include "cker/operation/reference/BinaryArithmeticOps.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+template <BinaryArithmeticOpType op_type, typename T>
+const std::function<T(const T &, const T &)> GetBinaryArtithmeticFn()
+{
+ switch (op_type)
+ {
+ case BinaryArithmeticOpType::ADD:
+ {
+ return [](const T &a, const T &b) -> T { return a + b; };
+ }
+ case BinaryArithmeticOpType::MUL:
+ {
+ return [](const T &a, const T &b) -> T { return a * b; };
+ }
+ case BinaryArithmeticOpType::SUB:
+ {
+ return [](const T &a, const T &b) -> T { return a - b; };
+ }
+ case BinaryArithmeticOpType::DIV:
+ {
+ if (std::is_floating_point<T>::value)
+ return [](const T &a, const T &b) -> T { return a / b; };
+ else
+ return [](const T &a, const T &b) -> T {
+ if (b == 0)
+ throw std::runtime_error("Divide by zero");
+ return a / b;
+ };
+ }
+ case BinaryArithmeticOpType::POW:
+ {
+ return [](const T &a, const T &b) -> T { return std::pow(a, b); };
+ }
+ default:
+ {
+ assert(false);
+ return nullptr;
+ }
+ }
+}
+} // namespace
+
+// Consolidates dimensions in broadcast inputs, checks for five-fold pattern.
+//
+// For example, if sequence of dimensions of one input is
+// ..., 1, 3, 1, 7, 9, 5,... and the other is ..., 2, 3, 1, 7, 1, 1, ...
+// we can consolidate these as
+// ..., 1, 3*7, 9*5, ... and 2, 3*7, 1.
+//
+// The category is updated in the less-frequent case of shapes that are
+// not suited to a fivefold-loop broadcast.
+//
+// Falls back to generic pattern when it does not know how to process properly.
+//
+// Returns true iff there is some sort of broadcast, which includes five-fold
+// patterns and falling back to generic broadcast.
+inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1,
+ BinaryArithmeticOpParam *params)
+{
+ const int dims_count = std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
+
+ params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+ Shape scalar_shape(dims_count, 1);
+
+ auto extended_shape0 = Shape::ExtendedShape(dims_count, shape0);
+ auto extended_shape1 = Shape::ExtendedShape(dims_count, shape1);
+
+ // Check for "exact" match, implicitly accepting any scalar shapes.
+ if (extended_shape0 == extended_shape1)
+ {
+ params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
+ return false;
+ }
+
+ for (int i = dims_count - 1; i >= 0; --i)
+ {
+ if (extended_shape0.Dims(i) == extended_shape1.Dims(i))
+ {
+ continue;
+ }
+ else if (extended_shape0.Dims(i) == 1)
+ {
+ params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast;
+ break;
+ }
+ else if (extended_shape1.Dims(i) == 1)
+ {
+ params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast;
+ break;
+ }
+ else
+ {
+ // This case is erroneous: there is a dimension that does not match and
+ // is not a broadcast from one shape to the other.
+ params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+ return true;
+ }
+ }
+
+ if (params->broadcast_category != BroadcastableOpCategory::kFirstInputBroadcastsFast &&
+ params->broadcast_category != BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ return false;
+ }
+
+ // From this point it is assumed contractually that corresponding dimensions
+ // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
+ const bool swap_inputs =
+ params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast;
+ const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0;
+ const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1;
+
+ int i = dims_count - 1;
+ params->broadcast_shape[0] = 1;
+ params->broadcast_shape[1] = 1;
+ params->broadcast_shape[2] = 1;
+ params->broadcast_shape[3] = 1;
+ params->broadcast_shape[4] = 1;
+ // y_0 is greedy: include dims if both or neither equal 1: in other words,
+ // test for equality rather than (shape_a->Dims(i) != 1).
+ while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
+ {
+ params->broadcast_shape[4] *= shape_b->Dims(i);
+ --i;
+ }
+ // Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
+ // that has the unit dimension, the next two loops are not entered.
+ while (i >= 0 && shape_a->Dims(i) == 1)
+ {
+ params->broadcast_shape[3] *= shape_b->Dims(i);
+ --i;
+ }
+ while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
+ {
+ params->broadcast_shape[2] *= shape_a->Dims(i);
+ --i;
+ }
+ // Here either input_a or input_b has dim of 1 (if i >= 0).
+ while (i >= 0 && shape_b->Dims(i) == 1)
+ {
+ params->broadcast_shape[1] *= shape_a->Dims(i);
+ --i;
+ }
+ while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
+ {
+ params->broadcast_shape[0] *= shape_b->Dims(i);
+ --i;
+ }
+
+ // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
+ // loop.
+ if (i >= 0)
+ {
+ params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+ }
+ return true;
+}
+
+template <BinaryArithmeticOpType op_type, typename T>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data)
+{
+ reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
+}
+
+template <BinaryArithmeticOpType op_type>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const uint8_t *input1_data, const Shape &input2_shape,
+ const uint8_t *input2_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ switch (op_type)
+ {
+ case nnfw::cker::BinaryArithmeticOpType::ADD:
+ case nnfw::cker::BinaryArithmeticOpType::SUB:
+ optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::MUL:
+ optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
+ const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::DIV:
+ throw std::runtime_error{"Quant8 Asymm NYI"};
+
+ default:
+ assert(false);
+ break;
+ }
+}
+
+template <BinaryArithmeticOpType op_type>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ // Only float is supported for now
+ switch (op_type)
+ {
+ case nnfw::cker::BinaryArithmeticOpType::ADD:
+ optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::MUL:
+ optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::SUB:
+ optimized::Sub(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::DIV:
+ optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+}
+
+template <BinaryArithmeticOpType op_type, typename T>
+inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape,
+ T *output_data)
+{
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data,
+ GetBinaryArtithmeticFn<op_type, T>());
+}
+
+template <BinaryArithmeticOpType op_type>
+inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const uint8_t *input1_data, const Shape &input2_shape,
+ const uint8_t *input2_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ switch (op_type)
+ {
+ case nnfw::cker::BinaryArithmeticOpType::ADD:
+ case nnfw::cker::BinaryArithmeticOpType::SUB:
+ optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::MUL:
+ optimized::BroadcastMulDispatchQuant8(
+ params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
+ const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::DIV:
+ case nnfw::cker::BinaryArithmeticOpType::POW:
+ throw std::runtime_error{"Quant8 Asymm NYI"};
+ default:
+ assert(false);
+ break;
+ }
+}
+
+template <BinaryArithmeticOpType op_type>
+inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ // Only float is supported for now
+ switch (op_type)
+ {
+ case nnfw::cker::BinaryArithmeticOpType::ADD:
+ optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::MUL:
+ optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::SUB:
+ optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::DIV:
+ optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
+ case nnfw::cker::BinaryArithmeticOpType::POW:
+ reference::BroadcastBinaryArithmeticOpSlow<float>(
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ GetBinaryArtithmeticFn<op_type, float>());
+ break;
+ default:
+ assert(false);
+ break;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
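To make the fivefold consolidation above concrete: once ProcessBroadcastShapes has filled broadcast_shape[5] = {y0, y1, y2, y3, y4} for the "first input broadcasts fast" case, the first input can be viewed as [y0, y1, y2, 1, y4] and the second as [y0, 1, y2, y3, y4]. The sketch below is illustrative only (the real kernels are the optimized::Broadcast*Dispatch functions) and shows the five nested loops such a layout allows.

// a is laid out as [y0, y1, y2, 1, y4]; b as [y0, 1, y2, y3, y4];
// out as [y0, y1, y2, y3, y4]. Function name is illustrative.
void FivefoldAdd(const int y[5], const float *a, const float *b, float *out)
{
  for (int i0 = 0; i0 < y[0]; ++i0)
    for (int i1 = 0; i1 < y[1]; ++i1)
      for (int i2 = 0; i2 < y[2]; ++i2)
        for (int i3 = 0; i3 < y[3]; ++i3)
          for (int i4 = 0; i4 < y[4]; ++i4)
          {
            const int a_idx = ((i0 * y[1] + i1) * y[2] + i2) * y[4] + i4;
            const int b_idx = ((i0 * y[2] + i2) * y[3] + i3) * y[4] + i4;
            const int o_idx = (((i0 * y[1] + i1) * y[2] + i2) * y[3] + i3) * y[4] + i4;
            out[o_idx] = a[a_idx] + b[b_idx];
          }
}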
diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h
new file mode 100644
index 000000000..5068eca96
--- /dev/null
+++ b/compute/cker/include/cker/operation/BroadcastTo.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BROADCAST_TO_H__
+#define __NNFW_CKER_BROADCAST_TO_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/BCast.h"
+
+#include <vector>
+
+#define UNUSED(x) (void)(x)
+
+namespace nnfw
+{
+namespace cker
+{
+namespace functor
+{
+static const int32_t kint32max = ((int32_t)0x7FFFFFFF);
+
+template <typename Device, typename T> struct FillFunctor
+{
+ // Computes on device "d": out = out.constant(in(0)).
+ void operator()(const Device &d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in);
+};
+
+template <typename T> struct FillFunctor<Eigen::ThreadPoolDevice, T>
+{
+ void operator()(const Eigen::ThreadPoolDevice &d, typename TTypes<T>::Flat out,
+ typename TTypes<T>::ConstScalar in)
+ {
+ out.device(d) = out.constant(in());
+ }
+};
+
+template <typename Device, typename T> struct BroadcastTo
+{
+ template <int NDIMS>
+ void DoBCast32Bit(const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in,
+ const typename Eigen::array<int, NDIMS> &bcast) const
+ {
+ To32Bit(out).device(device) = To32Bit(in).broadcast(bcast);
+ }
+
+ template <int NDIMS>
+ void DoBCast(const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+ typename TTypes<T, NDIMS>::ConstTensor in,
+ const typename Eigen::array<Eigen::DenseIndex, NDIMS> &bcast) const
+ {
+ out.device(device) = in.broadcast(bcast);
+ }
+
+ template <int NDIMS>
+ void ReshapeAndBCast(const Device &device, Tensor &output_tensor, const Tensor &input_tensor,
+ const BCast &bcast) const
+ {
+ const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value &&
+ output_tensor.shape.FlatSize() < kint32max &&
+ input_tensor.shape.FlatSize() < kint32max;
+ if (can_use_32bit)
+ {
+ DoBCast32Bit<NDIMS>(device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+ input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+ BCast::ToIndexArrayType<int, NDIMS>(bcast.x_bcast()));
+ }
+ else
+ {
+ DoBCast<NDIMS>(device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+ input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+ BCast::ToIndexArrayType<Eigen::DenseIndex, NDIMS>(bcast.x_bcast()));
+ }
+ }
+
+ // PRECONDITION: rank(input_shape) > 0 &&
+ // rank(input_shape) <= rank(output_shape) &&
+ // output_shape.num_elements() > 0.
+ void operator()(const Device &device, Tensor &output_tensor, const Shape &output_shape,
+ const Tensor &input_tensor, const Shape &input_shape, const BCast &bcast) const
+ {
+ const int ndims = bcast.y_reshape().size();
+ switch (ndims)
+ {
+ case 1:
+ ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast);
+ break;
+ case 2:
+ ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast);
+ break;
+ case 3:
+ ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast);
+ break;
+ case 4:
+ ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast);
+ break;
+ case 5:
+ ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast);
+ break;
+ default:
+ // NOTE: UNUSED is kept for maintenance purposes.
+ UNUSED(output_shape);
+ UNUSED(input_shape);
+ break;
+ }
+ }
+};
+} // namespace functor
+
+template <typename T>
+inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape,
+ T *output_data)
+{
+ const int input_flatsize = input_shape.FlatSize();
+
+ if (input_shape == output_shape)
+ {
+ memcpy(output_data, input_data, input_flatsize * sizeof(T));
+ return;
+ }
+
+ // Input shape's rank must be no greater than rank of output shape.
+ assert(input_shape.DimensionsCount() <= output_shape.DimensionsCount());
+
+ // The output rank shouldn't be 0.
+ assert(output_shape.DimensionsCount());
+
+ Tensor output_tensor;
+ Tensor input_tensor;
+
+ input_tensor.shape.ReplaceWith(input_shape.DimensionsCount(), input_shape.DimsData());
+ input_tensor.buffer = input_data;
+
+ output_tensor.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output_tensor.buffer = output_data;
+
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ // Handle broadcast from Scalar.
+ if (input_flatsize == 0)
+ {
+ functor::FillFunctor<Eigen::ThreadPoolDevice, T>()(device, output_tensor.flat<T>(),
+ input_tensor.scalar<T>());
+ }
+
+ BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
+ /*fewer_dims_optimization=*/true);
+
+ // The broadcast is expected to be valid.
+ assert(bcast.IsValid());
+ // The computed broadcast output shape should match the requested output shape.
+ assert(BCast::ToShape(bcast.output_shape()) == output_shape);
+
+ functor::BroadcastTo<Eigen::ThreadPoolDevice, T>()(device, output_tensor, output_shape,
+ input_tensor, input_shape, bcast);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BROADCAST_TO_H__
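A hypothetical usage sketch of the entry point above. It assumes the brace-initialized Shape constructor used elsewhere in these headers and the Eigen thread pool set up by cker/eigen/EigenSupport.h; the function name is illustrative.

#include <vector>
#include "cker/operation/BroadcastTo.h"

// Broadcast a [1, 3] row into a [2, 3] matrix.
void BroadcastRowExample()
{
  nnfw::cker::Shape in_shape{1, 3};
  nnfw::cker::Shape out_shape{2, 3};
  std::vector<float> in{1.f, 2.f, 3.f};
  std::vector<float> out(6);
  nnfw::cker::BroadcastTo<float>(in_shape, in.data(), out_shape, out.data());
  // out now holds {1, 2, 3, 1, 2, 3}: the row is repeated along the new axis.
}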
diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h
new file mode 100644
index 000000000..d69b38aca
--- /dev/null
+++ b/compute/cker/include/cker/operation/Common.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_COMMON_H__
+#define __NNFW_CKER_COMMON_H__
+
+#include "cker/neon/neon_check.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data,
+ int array_size, float *array_data)
+{
+ // Note: see b/132215220: in May 2019 we thought it would be OK to replace
+ // this with the Eigen one-liner:
+ // return (array.colwise() + bias).cwiseMax(clamp_min).cwiseMin(clamp_max).
+ // This turned out to severely regress performance: +4ms (i.e. 8%) on
+ // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
+ assert((array_size % bias_size) == 0);
+#ifdef USE_NEON
+ float *array_ptr = array_data;
+ float *array_end_ptr = array_ptr + array_size;
+ const auto clamp_min_vec = vdupq_n_f32(clamp_min);
+ const auto clamp_max_vec = vdupq_n_f32(clamp_max);
+ for (; array_ptr != array_end_ptr; array_ptr += bias_size)
+ {
+ int i = 0;
+ for (; i <= bias_size - 16; i += 16)
+ {
+ auto b0 = vld1q_f32(bias_data + i);
+ auto b1 = vld1q_f32(bias_data + i + 4);
+ auto b2 = vld1q_f32(bias_data + i + 8);
+ auto b3 = vld1q_f32(bias_data + i + 12);
+ auto a0 = vld1q_f32(array_ptr + i);
+ auto a1 = vld1q_f32(array_ptr + i + 4);
+ auto a2 = vld1q_f32(array_ptr + i + 8);
+ auto a3 = vld1q_f32(array_ptr + i + 12);
+ auto x0 = vaddq_f32(a0, b0);
+ auto x1 = vaddq_f32(a1, b1);
+ auto x2 = vaddq_f32(a2, b2);
+ auto x3 = vaddq_f32(a3, b3);
+ x0 = vmaxq_f32(clamp_min_vec, x0);
+ x1 = vmaxq_f32(clamp_min_vec, x1);
+ x2 = vmaxq_f32(clamp_min_vec, x2);
+ x3 = vmaxq_f32(clamp_min_vec, x3);
+ x0 = vminq_f32(clamp_max_vec, x0);
+ x1 = vminq_f32(clamp_max_vec, x1);
+ x2 = vminq_f32(clamp_max_vec, x2);
+ x3 = vminq_f32(clamp_max_vec, x3);
+ vst1q_f32(array_ptr + i, x0);
+ vst1q_f32(array_ptr + i + 4, x1);
+ vst1q_f32(array_ptr + i + 8, x2);
+ vst1q_f32(array_ptr + i + 12, x3);
+ }
+ for (; i <= bias_size - 4; i += 4)
+ {
+ auto b = vld1q_f32(bias_data + i);
+ auto a = vld1q_f32(array_ptr + i);
+ auto x = vaddq_f32(a, b);
+ x = vmaxq_f32(clamp_min_vec, x);
+ x = vminq_f32(clamp_max_vec, x);
+ vst1q_f32(array_ptr + i, x);
+ }
+ for (; i < bias_size; i++)
+ {
+ array_ptr[i] =
+ ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
+ }
+ }
+#else // not NEON
+ for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
+ {
+ for (int i = 0; i < bias_size; i++)
+ {
+ array_data[array_offset + i] = ActivationFunctionWithMinMax(
+ array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
+ }
+ }
+#endif
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_COMMON_H__
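A hypothetical usage of BiasAndClamp with ReLU6-style clamping (min 0, max 6): a 4-channel bias applied to two "pixels" of activations. The expected values in the comment follow directly from "add the bias, then clamp"; the input numbers are made up for illustration.

#include "cker/operation/Common.h"

int main()
{
  float bias[4] = {0.5f, -1.0f, 2.0f, 10.0f};
  float act[8] = {1.f, 1.f, 1.f, 1.f, -3.f, 7.f, 0.f, 0.f};
  nnfw::cker::BiasAndClamp(0.0f, 6.0f, 4, bias, 8, act);
  // act becomes {1.5, 0, 3, 6, 0, 6, 2, 6}: each channel gets its bias, then clamps.
  return 0;
}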
diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h
new file mode 100644
index 000000000..47eb6034c
--- /dev/null
+++ b/compute/cker/include/cker/operation/Comparison.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_COMPARISON_H__
+#define __NNFW_CKER_COMPARISON_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T> inline bool EqualFn(T lhs, T rhs) { return lhs == rhs; }
+template <typename T> inline bool NotEqualFn(T lhs, T rhs) { return lhs != rhs; }
+template <typename T> inline bool GreaterFn(T lhs, T rhs) { return lhs > rhs; }
+template <typename T> inline bool GreaterEqualFn(T lhs, T rhs) { return lhs >= rhs; }
+template <typename T> inline bool LessFn(T lhs, T rhs) { return lhs < rhs; }
+template <typename T> inline bool LessEqualFn(T lhs, T rhs) { return lhs <= rhs; }
+
+template <typename T> using ComparisonFn = bool (*)(T, T);
+
+template <typename T, ComparisonFn<T> F>
+inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ const int64_t flatsize = // number of elements to compare
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = F(input1_data[i], input2_data[i]);
+ }
+}
+
+template <ComparisonFn<float> F>
+inline void Comparison(const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ ComparisonImpl<float, F>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+}
+
+template <typename T, ComparisonFn<int32_t> F>
+inline void ComparisonWithScaling(ComparisonParams &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape,
+ bool *output_data)
+{
+ int left_shift = params.left_shift;
+ int32_t input1_offset = params.input1_offset;
+ int32_t input1_multiplier = params.input1_multiplier;
+ int input1_shift = params.input1_shift;
+ int32_t input2_offset = params.input2_offset;
+ int32_t input2_multiplier = params.input2_multiplier;
+ int input2_shift = params.input2_shift;
+ const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ const int32_t input1_val = input1_offset + input1_data[i];
+ const int32_t input2_val = input2_offset + input2_data[i];
+ const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ output_data[i] = F(scaled_input1_val, scaled_input2_val);
+ }
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void
+BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, bool *output_data)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(output_shape, b, y, x, c)] =
+ F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
+ }
+ }
+ }
+ }
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void BroadcastComparison4DSlow(const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ BroadcastComparison4DSlowImpl<T, F>(input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+}
+
+template <typename T, ComparisonFn<int32_t> F>
+inline void BroadcastComparison4DSlowWithScaling(ComparisonParams &params,
+ const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ assert(input1_shape.DimensionsCount() <= 4);
+ assert(input2_shape.DimensionsCount() <= 4);
+ assert(output_shape.DimensionsCount() <= 4);
+
+ int left_shift = params.left_shift;
+ int32_t input1_offset = params.input1_offset;
+ int32_t input1_multiplier = params.input1_multiplier;
+ int input1_shift = params.input1_shift;
+ int32_t input2_offset = params.input2_offset;
+ int32_t input2_multiplier = params.input2_multiplier;
+ int input2_shift = params.input2_shift;
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ const int32_t input1_val =
+ input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+ const int32_t input2_val =
+ input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+ const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val);
+ }
+ }
+ }
+ }
+}
+
+#define TFLITE_COMPARISON_OP(name) \
+ template <typename T> \
+ inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \
+ const T *input2_data, const Shape &output_shape, bool *output_data) \
+ { \
+ Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \
+ output_data); \
+ } \
+ template <typename T> \
+ inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
+ output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void name##WithScaling(ComparisonParams &params, const Shape &input1_shape, \
+ const T *input1_data, const Shape &input2_shape, \
+ const T *input2_data, const Shape &output_shape, \
+ bool *output_data) \
+ { \
+ ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \
+ input2_data, output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \
+ input2_data, output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
+ output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name##WithScaling(ComparisonParams &params, \
+ const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \
+ }
+
+TFLITE_COMPARISON_OP(Equal);
+TFLITE_COMPARISON_OP(NotEqual);
+TFLITE_COMPARISON_OP(Greater);
+TFLITE_COMPARISON_OP(GreaterEqual);
+TFLITE_COMPARISON_OP(Less);
+TFLITE_COMPARISON_OP(LessEqual);
+#undef TFLITE_COMPARISON_OP
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_COMPARISON_H__
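A hypothetical usage of the macro-generated entry points above, comparing two float tensors elementwise without quantization scaling. It assumes the brace-initialized Shape constructor seen elsewhere in these headers; the values are made up for illustration.

#include <cassert>
#include "cker/operation/Comparison.h"

int main()
{
  nnfw::cker::Shape shape{1, 4}; // a 1 x 4 tensor, flat size 4
  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[4] = {1.f, 0.f, 3.f, 5.f};
  bool out[4];
  nnfw::cker::LessNoScaling(shape, a, shape, b, shape, out);
  assert(!out[0] && !out[1] && !out[2] && out[3]); // only 4 < 5 holds
  return 0;
}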
diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h
new file mode 100644
index 000000000..394123e30
--- /dev/null
+++ b/compute/cker/include/cker/operation/Concatenation.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONCATENATION_H__
+#define __NNFW_CKER_CONCATENATION_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cstdint>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+inline void Concatenation(const ConcatenationParams &params, const Shape *const *input_shapes,
+ const Scalar *const *input_data, const Shape &output_shape,
+ Scalar *output_data)
+{
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+ const int concat_dimensions = output_shape.DimensionsCount();
+ assert(axis < concat_dimensions);
+
+ int64_t concat_size = 0;
+ for (int i = 0; i < inputs_count; i++)
+ {
+ assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+ for (int j = 0; j < concat_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
+ UNUSED_RELEASE(dim_checked);
+ }
+ }
+ concat_size += input_shapes[i]->Dims(axis);
+ }
+ assert(concat_size == output_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ // For all input arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < concat_dimensions; ++i)
+ {
+ base_inner_size *= output_shape.Dims(i);
+ }
+
+ Scalar *output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
+
+ // Note: this routine is not fully quantized, as it takes the scale as a floating
+ // point value. This should be fixed when optimizing this routine further.
+inline void ConcatenationWithScaling(const ConcatenationParams &params,
+ const Shape *const *input_shapes,
+ const uint8_t *const *input_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ int axis = params.axis;
+ const int32_t *input_zeropoint = params.input_zeropoint;
+ const float *input_scale = params.input_scale;
+ int inputs_count = params.inputs_count;
+ const int32_t output_zeropoint = params.output_zeropoint;
+ const float output_scale = params.output_scale;
+
+ const int concat_dimensions = output_shape.DimensionsCount();
+ assert(axis <= concat_dimensions);
+
+ int64_t concat_size = 0;
+ for (int i = 0; i < inputs_count; i++)
+ {
+ assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+ for (int j = 0; j < concat_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ assert(input_shapes[i]->Dims(j) == output_shape.Dims(j));
+ }
+ }
+ concat_size += input_shapes[i]->Dims(axis);
+ }
+ assert(concat_size == output_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ // For all input arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < concat_dimensions; ++i)
+ {
+ base_inner_size *= output_shape.Dims(i);
+ }
+
+ const float inverse_output_scale = 1.f / output_scale;
+ uint8_t *output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+ const uint8_t *input_ptr = input_data[i] + k * copy_size;
+ if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+ {
+ memcpy(output_ptr, input_ptr, copy_size);
+ }
+ else
+ {
+ const float scale = input_scale[i] * inverse_output_scale;
+ const float bias = -input_zeropoint[i] * scale;
+ for (int j = 0; j < copy_size; ++j)
+ {
+ const int32_t value =
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+ }
+ }
+ output_ptr += copy_size;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONCATENATION_H__
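The per-element requantization in ConcatenationWithScaling re-expresses an input value under the output's (scale, zero_point). A standalone restatement with a worked number follows; the function name and the example parameters are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t Requantize(uint8_t in, float in_scale, int32_t in_zp, float out_scale, int32_t out_zp)
{
  const float scale = in_scale / out_scale;
  const float bias = -in_zp * scale;
  const int32_t v = static_cast<int32_t>(std::round(in * scale + bias)) + out_zp;
  return static_cast<uint8_t>(std::max(std::min(255, v), 0));
}
// Example: in = 200 with (scale 0.5, zero_point 0) represents the real value 100.0;
// re-expressed with (scale 1.0, zero_point 10) it becomes round(100.0 / 1.0) + 10 = 110.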
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
new file mode 100644
index 000000000..b20bac3ac
--- /dev/null
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONV_H__
+#define __NNFW_CKER_CONV_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/operation/reference/Conv.h"
+#include "cker/operation/optimized/Conv.h"
+#include <iostream>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+// Naive implementation of transpose for floats. Could be optimized to be more
+// cache friendly, but for now it's a one-time cost on first run, and we would
+// prefer to remove the need to do this at all eventually.
+inline void TransposeFloatTensor(const float *input_data, const nnfw::cker::Shape &output_shape,
+ float *output_data)
+{
+ const int rows = output_shape.Dims(1);
+ const int cols = output_shape.Dims(0);
+ for (int i = 0; i < rows; ++i)
+ {
+ for (int j = 0; j < cols; ++j)
+ {
+ const float in_value = input_data[i * cols + j];
+ output_data[j * rows + i] = in_value;
+ }
+ }
+}
+} // namespace
+
+class Conv
+{
+public:
+ Conv() : _modified_filter_data(), _im2col_shape(4), _need_im2col(false), _prepared(false) {}
+
+ void prepare(const Shape &filter_shape, const float *filter_data, PaddingType padding_type,
+ bool &is_replaced_weights, uint32_t dilationWidthFactor,
+ uint32_t dilationHeightFactor)
+ {
+ if (!_prepared)
+ {
+ if (usableMultiThreaded(padding_type, dilationWidthFactor, dilationHeightFactor))
+ {
+ transposeFilter(filter_shape, filter_data, is_replaced_weights);
+ }
+ _prepared = true;
+ }
+ }
+
+ void prepareQuant(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape,
+ uint32_t stride_width, uint32_t stride_height, uint32_t dilation_width_factor,
+ uint32_t dilation_height_factor)
+ {
+ if (!_prepared)
+ {
+ IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height,
+ dilation_width_factor, dilation_height_factor);
+ _prepared = true;
+ }
+ }
+
+ void operator()(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+ {
+ if (usableMultiThreaded(params.padding_type, params.dilation_width_factor,
+ params.dilation_height_factor))
+ {
+ bool transposed_in_execution = false;
+ if (!_prepared)
+ {
+ // This means that filter is not constant
+ // TODO Apply optimized kernel if multithreaded kernel is slower than optimized kernel by
+ // transposing filter data
+ transposeFilter(filter_shape, filter_data, transposed_in_execution);
+ }
+ multithreaded::Conv(params, input_shape, input_data, filter_shape, &_modified_filter_data[0],
+ bias_shape, bias_data, output_shape, output_data);
+ }
+ else
+ {
+ // TODO Support optimized kernel
+ reference::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+ bias_data, output_shape, output_data);
+ }
+ }
+
+ void operator()(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+ {
+ if (!_prepared)
+ {
+ // This means that the input or output is dynamic, or the filter is not constant
+ IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width,
+ params.stride_height, params.dilation_width_factor,
+ params.dilation_height_factor);
+ }
+
+ int im2col_size = _need_im2col ? _im2col_shape.FlatSize() : 1;
+
+ // Use heap if size is larger than 8MB
+ if (im2col_size > 8 * 1024 * 1024)
+ {
+ std::unique_ptr<uint8_t[]> im2col_data = std::make_unique<uint8_t[]>(im2col_size);
+ optimized::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+ bias_data, output_shape, output_data, _im2col_shape, im2col_data.get());
+ }
+ else
+ {
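+ // Note: a variable-length array is a GCC/Clang extension rather than standard
+ // C++; it keeps small im2col buffers on the stack and avoids a heap allocation.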
+ uint8_t im2col_data[im2col_size];
+ optimized::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+ bias_data, output_shape, output_data, _im2col_shape, im2col_data);
+ }
+ }
+
+private:
+ bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor,
+ uint32_t dilation_height_factor)
+ {
+ return padding_type != PaddingType::kNone && std::thread::hardware_concurrency() > 1 &&
+ dilation_width_factor == 1 && dilation_height_factor == 1;
+ }
+
+ void transposeFilter(const Shape &filter_shape, const float *filter_data,
+ bool &is_replaced_weights)
+ {
+ const auto output_depth = filter_shape.Dims(0);
+ const Shape hwcn_filter_shape{filter_shape.FlatSize() / output_depth, output_depth};
+ _modified_filter_data.resize(hwcn_filter_shape.FlatSize());
+ TransposeFloatTensor(filter_data, hwcn_filter_shape, &_modified_filter_data[0]);
+ is_replaced_weights = true;
+ }
+
+ void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape,
+ const Shape &output_shape, uint32_t stride_width, uint32_t stride_height,
+ uint32_t dilation_width_factor, uint32_t dilation_height_factor)
+ {
+ const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+ const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 ||
+ kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1;
+
+ _need_im2col = need_dilated_im2col || need_non_dilated_im2col;
+
+ if (_need_im2col)
+ {
+ _im2col_shape.SetDim(0, output_shape.Dims(0));
+ _im2col_shape.SetDim(1, output_shape.Dims(1));
+ _im2col_shape.SetDim(2, output_shape.Dims(2));
+ _im2col_shape.SetDim(3, input_shape.Dims(3) * kernel_shape.Dims(1) * kernel_shape.Dims(2));
+ }
+ }
+
+private:
+ std::vector<float> _modified_filter_data;
+ Shape _im2col_shape;
+ bool _need_im2col;
+ bool _prepared;
+};
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONV_H__
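
A minimal usage sketch of the Conv wrapper above, assuming the caller already owns the tensor buffers and a filled-in ConvParams (the authoritative field layout lives in cker/Types.h); the function name and wiring here are illustrative only, not part of the patch:

    #include "cker/operation/Conv.h"

    void RunFloatConv(const nnfw::cker::ConvParams &params,
                      const nnfw::cker::Shape &in_shape, const float *in_data,
                      const nnfw::cker::Shape &filter_shape, const float *filter_data,
                      const nnfw::cker::Shape &bias_shape, const float *bias_data,
                      const nnfw::cker::Shape &out_shape, float *out_data)
    {
      nnfw::cker::Conv conv;
      bool filter_replaced = false;
      // For a constant filter, prepare() transposes it once up front when the
      // multithreaded kernel is applicable.
      conv.prepare(filter_shape, filter_data, params.padding_type, filter_replaced,
                   params.dilation_width_factor, params.dilation_height_factor);
      // operator() then dispatches to the multithreaded or reference kernel.
      conv(params, in_shape, in_data, filter_shape, filter_data, bias_shape, bias_data,
           out_shape, out_data);
    }
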
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
new file mode 100644
index 000000000..814a9e019
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
+#define __NNFW_CKER_DEPTHWISE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+#include "cker/operation/optimized/DepthwiseConvUint8.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ assert(dilation_width_factor >= 1);
+ assert(dilation_height_factor >= 1);
+ UNUSED_RELEASE(dilation_width_factor);
+ UNUSED_RELEASE(dilation_height_factor);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ assert(output_activation_min <= output_activation_max);
+ UNUSED_RELEASE(output_activation_min);
+ UNUSED_RELEASE(output_activation_max);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_depth = input_shape.Dims(3);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(input_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(depth_multiplier);
+
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__)
+// TODO Use below codes
+
+// const int stride_width = params.stride_width;
+// const int stride_height = params.stride_height;
+// const int pad_width = params.padding_values.width;
+// const int pad_height = params.padding_values.height;
+// const int output_shift = params.output_shift;
+//
+// // Call kernel optimized for depthwise convolutions using 3x3 filters if
+// // parameters are supported.
+// if (Fast3x3FilterKernelSupported(
+// input_shape, filter_shape, stride_width, stride_height,
+// dilation_width_factor, dilation_height_factor, pad_width, pad_height,
+// depth_multiplier, output_shape, output_shift)) {
+// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
+// filter_data, bias_shape, bias_data, output_shape,
+// output_data);
+// return;
+// }
+#endif
+
+ optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data);
+}
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+ float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[oc];
+ }
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTHWISE_CONV_H__
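
A short, hypothetical driver for the float DepthwiseConv above; the shapes and values are made up for illustration. With a unit depth_multiplier, an all-ones input, and a constant 0.5 filter, every 2x2 window sums to 2.0:

    #include "cker/operation/DepthwiseConv.h"

    #include <vector>

    void DepthwiseConvExample()
    {
      nnfw::cker::DepthwiseConvParams params{};
      params.stride_width = params.stride_height = 1;
      params.dilation_width_factor = params.dilation_height_factor = 1;
      params.depth_multiplier = 1;
      params.float_activation_min = -1e30f;
      params.float_activation_max = 1e30f;

      const nnfw::cker::Shape in_shape({1, 3, 3, 2});     // NHWC
      const nnfw::cker::Shape filter_shape({1, 2, 2, 2}); // 1 x H x W x (C * multiplier)
      const nnfw::cker::Shape bias_shape({2});
      const nnfw::cker::Shape out_shape({1, 2, 2, 2});

      std::vector<float> input(in_shape.FlatSize(), 1.0f);
      std::vector<float> filter(filter_shape.FlatSize(), 0.5f);
      std::vector<float> bias(2, 0.0f);
      std::vector<float> output(out_shape.FlatSize());

      nnfw::cker::DepthwiseConv(params, in_shape, input.data(), filter_shape, filter.data(),
                                bias_shape, bias.data(), out_shape, output.data());
      // Each output element is 4 * (1.0 * 0.5) = 2.0.
    }
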
diff --git a/compute/cker/include/cker/operation/Dequantize.h b/compute/cker/include/cker/operation/Dequantize.h
new file mode 100644
index 000000000..c4875812b
--- /dev/null
+++ b/compute/cker/include/cker/operation/Dequantize.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEQUANTIZE_H__
+#define __NNFW_CKER_DEQUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+#ifdef USE_NEON
+namespace
+{
+inline void ScaleWithNewZeroPoint(const int32x4_t input, const float32x4_t scale_dup,
+ const float32x4_t zero_times_scale_dup, float32x4_t *output)
+{
+#ifdef __ARM_FEATURE_FMA
+ *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup);
+#else
+ *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup), zero_times_scale_dup);
+#endif
+}
+} // namespace
+#endif // USE_NEON
+
+inline void Dequantize(const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, float *output_data, const float scale,
+ const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
+ const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const uint8x8_t input_u8 = vld1_u8(input_data + i);
+ const uint16x8_t input_u16 = vmovl_u8(input_u8);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16);
+ const int16x4_t input_s16_low = vget_low_s16(input_s16);
+ const int16x4_t input_s16_high = vget_high_s16(input_s16);
+ const int32x4_t val_low = vmovl_s16(input_s16_low);
+ const int32x4_t val_high = vmovl_s16(input_s16_high);
+
+ float32x4_t result_low, result_high;
+ ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
+ ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
+
+ vst1q_f32(output_data + i, result_low);
+ vst1q_f32(output_data + i + 4, result_high);
+ }
+#endif // USE_NEON
+ for (; i < flat_size; ++i)
+ {
+ const int32_t val = input_data[i];
+ const float result = static_cast<float>(scale * (val - zero_point));
+ output_data[i] = result;
+ }
+}
+
+inline void Dequantize(const Shape &input_shape, const int8_t *input_data,
+ const Shape &output_shape, float *output_data, const float scale,
+ const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
+ const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const int8x8_t input_s8 = vld1_s8(input_data + i);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x4_t input_s16_low = vget_low_s16(input_s16);
+ const int16x4_t input_s16_high = vget_high_s16(input_s16);
+ const int32x4_t val_low = vmovl_s16(input_s16_low);
+ const int32x4_t val_high = vmovl_s16(input_s16_high);
+
+ float32x4_t result_low, result_high;
+ ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
+ ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
+
+ vst1q_f32(output_data + i, result_low);
+ vst1q_f32(output_data + i + 4, result_high);
+ }
+#endif // USE_NEON
+ for (; i < flat_size; ++i)
+ {
+ const int32_t val = input_data[i];
+ const float result = static_cast<float>(scale * (val - zero_point));
+ output_data[i] = result;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEQUANTIZE_H__
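
The scalar tail of both overloads is the whole contract: output = scale * (q - zero_point); the NEON paths only vectorize that same affine map. A tiny illustrative call with hand-picked values:

    #include "cker/operation/Dequantize.h"

    void DequantizeExample()
    {
      const nnfw::cker::Shape shape({4});
      const uint8_t quantized[4] = {128, 130, 126, 255};
      float dequantized[4];
      // scale = 0.5, zero_point = 128: e.g. 0.5 * (130 - 128) = 1.0
      nnfw::cker::Dequantize(shape, quantized, shape, dequantized, 0.5f, 128);
      // dequantized is now {0.0f, 1.0f, -1.0f, 63.5f}
    }
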
diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h
new file mode 100644
index 000000000..3d1837f47
--- /dev/null
+++ b/compute/cker/include/cker/operation/Einsum.h
@@ -0,0 +1,934 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EINSUM_H__
+#define __NNFW_CKER_EINSUM_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/MatmulBCast.h"
+
+#include "Transpose.h"
+#include "BatchMatMul.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace functor
+{
+
+template <typename Device, typename T, int N> struct StrideFunctor
+{
+ void operator()(const Device &d, typename TTypes<T, N>::ConstTensor input,
+ const std::vector<int32_t> &strides, typename TTypes<T, N>::Tensor output)
+ {
+
+ Eigen::DSizes<Eigen::DenseIndex, N> dsizes;
+ for (size_t d = 0; d < strides.size(); d++)
+ {
+ dsizes[d] = static_cast<Eigen::DenseIndex>(strides[d]);
+ }
+ for (size_t d = strides.size(); d < N; d++)
+ {
+ dsizes[d] = 1;
+ }
+
+ output.device(d) = input.stride(dsizes);
+ }
+};
+
+template <typename Device, typename T, int N> struct InflateFunctor
+{
+ void operator()(const Device &d, typename TTypes<T, N>::ConstTensor input,
+ const std::vector<int32_t> &strides, typename TTypes<T, N>::Tensor output)
+ {
+
+ Eigen::DSizes<Eigen::DenseIndex, N> dsizes;
+ for (size_t d = 0; d < strides.size(); d++)
+ {
+ dsizes[d] = static_cast<Eigen::DenseIndex>(strides[d]);
+ }
+ for (size_t d = strides.size(); d < N; d++)
+ {
+ dsizes[d] = 1;
+ }
+
+ output.device(d) = input.inflate(dsizes);
+ }
+};
+
+template <typename Device, typename Reducer> struct ReduceFunctor
+{
+ template <typename OUT_T, typename IN_T, typename ReductionAxes>
+ static void Reduce(const Device &d, OUT_T out, IN_T in, const ReductionAxes &reduction_axes,
+ const Reducer &reducer)
+ {
+ out.device(d) = in.reduce(reduction_axes, reducer);
+ }
+};
+
+template <typename Device, typename T> struct SetZeroFunctor
+{
+ // Computes on device "d": out = out.setZero(),
+ void operator()(const Device &d, typename TTypes<T>::Flat out)
+ {
+ out.device(d) = out.constant(T(0));
+ }
+};
+
+} // namespace functor
+
+using ShapeVec = std::vector<int32_t>;
+using Labels = std::vector<int32_t>;
+using OperandLabels = std::vector<Labels>;
+using LabelCounts = std::vector<int32_t>;
+using OperandLabelCounts = std::vector<LabelCounts>;
+using LabelToDimSizes = std::vector<int32_t>;
+
+// Each dimension is categorized into exactly one of five types based on
+// whether its corresponding label is present in the input and/or the output
+// subscripts.
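+// For example, in the equation "bij,bjk->bik": 'b' is kBatch (it appears in
+// both inputs and the output), 'i' and 'k' are kFree (each appears in one
+// input and the output), and 'j' is kContract (it appears in both inputs but
+// not the output). A label that appears in only one input and not in the
+// output (e.g. 'i' in "ij->j") is kReduce, and ellipsis dimensions become
+// kBroadcasting.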
+enum DimensionType
+{
+ // Batch dimensions are those present in two inputs as well as the output.
+ // They are part of the batch dimensions during Tensor contraction.
+ // Such dimensions may be broadcasting dimensions (those mapping to
+ // ellipsis)
+ // or explicit batch dimensions corresponding to named axis labels.
+ kBroadcasting = 0,
+ kBatch = 1,
+ // Free dimensions are present in exactly one of the inputs, and also the
+ // output. These are non-contracted axes in the Tensor contraction.
+ kFree = 2,
+ // Contract dimensions are present in two inputs, but not the output. These
+ // dimensions are contracted in Tensor contraction.
+ kContract = 3,
+ // Reduce dimensions are present in exactly one input and not in the output;
+ // they are summed over prior to Tensor contraction.
+ kReduce = 4,
+};
+
+namespace
+{
+
+constexpr int kEllipsisLabel = -1;
+
+std::vector<std::string> strSplit(const std::string &text, const std::string delimiter)
+{
+ std::vector<std::string> result;
+
+ size_t start = 0;
+ size_t pos = 0;
+
+ do
+ {
+ pos = text.find(delimiter, start);
+ if (pos == std::string::npos)
+ {
+ result.push_back(text.substr(start, text.size() - start));
+ break;
+ }
+
+ result.push_back(text.substr(start, pos - start));
+ start = pos + delimiter.size();
+ } while (pos != std::string::npos);
+
+ return result;
+}
+
+inline DimensionType getDimensionType(bool is_removed, bool is_unique)
+{
+ if (!is_removed && !is_unique)
+ return kBatch;
+ else if (!is_removed && is_unique)
+ return kFree;
+ else if (is_removed && !is_unique)
+ return kContract;
+ else // is_removed && is_unique
+ return kReduce;
+}
+
+inline Shape copyShape(const Shape &shape)
+{
+ return Shape::ExtendedShape(shape.DimensionsCount(), shape);
+}
+} // namespace
+
+class Einsum
+{
+public:
+ Einsum() : _prepared(false)
+ {
+ // DO NOTHING
+ }
+
+ void prepare(std::string &equation)
+ {
+ if (_prepared)
+ {
+ return;
+ }
+
+ // Parse equation
+ parseEquation(equation);
+ _prepared = true;
+ }
+
+ void operator()(std::string &equation, const std::vector<Shape> &input_shapes,
+ const std::vector<const float *> &input_data, const Shape &output_shape,
+ float *output_data)
+ {
+ if (!_prepared)
+ {
+ prepare(equation);
+ }
+
+ const int num_inputs = input_shapes.size();
+ std::vector<InputTensor<float>> inputs(num_inputs);
+ for (int i = 0; i < num_inputs; i++)
+ {
+ inputs[i].shape.ReplaceWith(input_shapes[i].DimensionsCount(), input_shapes[i].DimsData());
+ inputs[i].buffer = input_data[i];
+ }
+
+ OperandLabels input_labels(_input_labels);
+ Labels output_labels(_output_labels);
+ std::vector<DimensionType> label_types(_label_types);
+ OperandLabelCounts input_label_counts(_input_label_counts);
+ LabelCounts output_label_counts(_output_label_counts);
+ LabelToDimSizes label_to_dim_sizes;
+
+ processDimensions(inputs, &input_labels, &output_labels, &label_types, &input_label_counts,
+ &output_label_counts, &label_to_dim_sizes);
+
+ // The reduction phase (a) sums across reduction dimensions, (b) takes
+ // generalized diagonals, and (c) reshapes it into shape
+ // [(broadcasting) batch shape] + [F,C]
+ // where F and C denote the total (compacted) size of free and contract
+ // dimensions, respectively.
+
+ OperandLabels free_labels(num_inputs);
+ std::vector<Tensor> inputs_reduced(num_inputs);
+ std::vector<bool> swap_free_and_contract(num_inputs);
+ for (int i = 0; i < num_inputs; ++i)
+ {
+ bool temp_swap_free_and_contract = false;
+ reduceOperand<float>(inputs[i], label_types, input_label_counts[i], &input_labels[i],
+ &free_labels[i], &temp_swap_free_and_contract, &inputs_reduced[i]);
+ swap_free_and_contract[i] = temp_swap_free_and_contract;
+ }
+
+ // After reduction, the inputs should be reshaped to Tensors suitable for
+ // contraction. If num_inputs is 1, the reduced input is simply forwarded to
+ // the output.
+ Tensor contraction_output_reshaped;
+ contractOperands(inputs_reduced, swap_free_and_contract, &contraction_output_reshaped);
+
+ // Copy the batch labels from the contraction output. Recover the batch
+ // shape, which may have been broadcasted.
+ std::vector<int32_t> result_shape_dims(contraction_output_reshaped.shape.DimensionsCount() - 2);
+
+ for (size_t i = 0; i < result_shape_dims.size(); i++)
+ {
+ result_shape_dims[i] = contraction_output_reshaped.shape.Dims(i);
+ }
+
+ int num_labels = label_types.size();
+ Labels result_labels;
+ // All batch dimensions should be present in the contracted result. First
+ // the broadcasting dimensions, then the named batch dimensions.
+ for (int label = 0; label < num_labels; ++label)
+ {
+ if (label_types[label] == kBroadcasting)
+ result_labels.push_back(label);
+ }
+ for (int label = 0; label < num_labels; ++label)
+ {
+ if (label_types[label] == kBatch)
+ result_labels.push_back(label);
+ }
+ for (int i = 0; i < num_inputs; ++i)
+ {
+ for (int label : free_labels[i])
+ {
+ result_labels.push_back(label);
+ result_shape_dims.push_back(label_to_dim_sizes[label]);
+ }
+ }
+
+ Shape result_shape(result_shape_dims.size(), result_shape_dims.data());
+
+ // Reshape the contraction (or reduction) result to its expanded shape:
+ // [(broadcasted) batch shape] + [free shape 0] + [free shape 1].
+ Tensor contraction_output;
+ copyFrom(contraction_output_reshaped, result_shape, &contraction_output);
+
+ // Inflate the output if necessary. (E.g. for the equation 'i->iii' which
+ // may arise while computing gradient of a regular Einsum).
+ // TODO(anudhyan): It's possible that Eigen's contract and inflate can be
+ // chained here to avoid materializing an intermediate.
+ Tensor output_inflated;
+ strideOrInflate<float>(contraction_output, result_labels, output_label_counts,
+ true /* should_inflate */, &output_inflated);
+
+ if (output_inflated.shape.DimensionsCount() > contraction_output.shape.DimensionsCount())
+ {
+ // We inflated the output. Modify result labels accordingly.
+ Labels inflated_labels;
+ for (int label : result_labels)
+ {
+ inflated_labels.insert(inflated_labels.end(), output_label_counts[label], label);
+ }
+ result_labels.swap(inflated_labels);
+ }
+
+ // Find the permutation to map the result labels to the output labels. Note
+ // that both the result and the final output may have the repeated labels,
+ // in which case the permutation preserves the left-to-right ordering.
+ // E.g. if result labels are [0, 0, 1] and output is [0, 1, 0] then the
+ // permutation should be [0, 2, 1]. We also use the fact that repeated
+ // labels in the result are adjacent to each other.
+ std::vector<int32_t> output_permutation(output_labels.size());
+ std::vector<int32_t> label_to_position(num_labels, -1);
+ for (size_t i = 0; i < result_labels.size(); ++i)
+ {
+ // Remember the position of only the leftmost result label.
+ if (label_to_position[result_labels[i]] == -1)
+ {
+ label_to_position[result_labels[i]] = i;
+ }
+ }
+ for (size_t i = 0; i < output_labels.size(); ++i)
+ {
+ output_permutation[i] = label_to_position[output_labels[i]];
+ // We have found the leftmost occurrence. The next one would be adjacent.
+ label_to_position[output_labels[i]] += 1;
+ }
+
+ InputTensor<float> temp_inflated;
+ temp_inflated.shape.ReplaceWith(output_inflated.shape.DimensionsCount(),
+ output_inflated.shape.DimsData());
+ temp_inflated.buffer = reinterpret_cast<const float *>(output_inflated.buffer);
+
+ Tensor output;
+ transposeOperand<float>(temp_inflated, output_permutation, &output);
+
+ memcpy(output_data, output.buffer, output_shape.FlatSize() * sizeof(float));
+
+ temp_operand.clear();
+ }
+
+private:
+ void parseEquation(std::string &equation)
+ {
+ std::vector<std::string> input_str;
+ std::string output_str;
+
+ parseEinsumEquation(equation, input_str, output_str);
+
+ // Temporary map from single character labels to (consecutive) integer
+ // labels.
+ std::map<char, int> label_mapping;
+ int num_inputs = input_str.size();
+ _input_labels.resize(num_inputs);
+
+ // Map from single characters to integer labels.
+ for (int i = 0; i < num_inputs; ++i)
+ {
+ mapToLabels(input_str[i], _input_labels.at(i), label_mapping);
+ }
+ mapToLabels(output_str, _output_labels, label_mapping);
+
+ // Compute counts for input and output labels.
+ int num_labels = label_mapping.size();
+ _input_label_counts.resize(num_inputs);
+ _input_has_ellipsis.resize(num_inputs);
+ for (int i = 0; i < num_inputs; ++i)
+ {
+ _input_label_counts.at(i).resize(num_labels);
+ for (const int label : _input_labels.at(i))
+ {
+ if (label != kEllipsisLabel)
+ _input_label_counts.at(i)[label] += 1;
+ else
+ _input_has_ellipsis.at(i) = true;
+ }
+ }
+ _output_label_counts.resize(num_labels);
+ for (const int label : _output_labels)
+ {
+ if (label != kEllipsisLabel)
+ _output_label_counts.at(label) += 1;
+ else
+ _output_has_ellipsis = true;
+ }
+
+ // Map each label to a unique DimensionType.
+ _label_types.resize(num_labels);
+ for (int label = 0; label < num_labels; ++label)
+ {
+ bool removed = (_output_label_counts[label] == 0);
+ bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 ||
+ _input_label_counts[1][label] == 0;
+ _label_types[label] = getDimensionType(removed, unique);
+ }
+ }
+
+ void parseEinsumEquation(const std::string &equation, std::vector<std::string> &input_subscripts,
+ std::string &output_subscript)
+ {
+ std::vector<std::string> inputs_and_output_subscripts = strSplit(equation, "->");
+ if (inputs_and_output_subscripts.size() != 2)
+ {
+ throw std::runtime_error{"Einsum: Expecting exactly one '->' in einsum equation: " +
+ equation};
+ }
+
+ output_subscript = inputs_and_output_subscripts[1];
+ input_subscripts = strSplit(inputs_and_output_subscripts[0], ",");
+ if (input_subscripts.size() != 1 && input_subscripts.size() != 2)
+ {
+ throw std::runtime_error{"Einsum: Expecting 1 or 2 input subscripts in equation '" +
+ equation + "' but got: " + std::to_string(input_subscripts.size())};
+ }
+ }
+
+ // Maps the character labels to consecutive integers.
+ void mapToLabels(const std::string &subscript, Labels &labels, std::map<char, int> &label_mapping)
+ {
+ for (size_t i = 0; i < subscript.size(); ++i)
+ {
+ const char label_char = subscript[i];
+ if (label_char == '.')
+ {
+ labels.push_back(kEllipsisLabel);
+ i += 2; // Skip next 2 characters as well.
+ continue;
+ }
+ if (label_mapping.find(label_char) == label_mapping.end())
+ {
+ const int next_label = label_mapping.size();
+ label_mapping[label_char] = next_label;
+ }
+ const int mapped_label = label_mapping[label_char];
+ labels.push_back(mapped_label);
+ }
+ }
+
+ template <typename T>
+ void processDimensions(const std::vector<InputTensor<T>> &inputs, OperandLabels *input_labels,
+ Labels *output_labels, std::vector<DimensionType> *label_types,
+ OperandLabelCounts *input_label_counts, LabelCounts *output_label_counts,
+ LabelToDimSizes *label_to_dim_sizes)
+ {
+ if (inputs.size() != input_labels->size())
+ {
+ throw std::runtime_error{"Expected " + std::to_string(input_labels->size()) +
+ " inputs but got: " + std::to_string(inputs.size())};
+ }
+ const int num_inputs = inputs.size();
+
+ // We infer the number of broadcasting dimensions by taking the maximum rank
+ // among the broadcasting subshapes of the input.
+ int max_bcast_dims = 0;
+ const int num_named_labels = label_types->size();
+ label_to_dim_sizes->resize(num_named_labels);
+ for (int i = 0; i < num_inputs; ++i)
+ {
+ Labels *labels = &(*input_labels)[i];
+
+ if (!_input_has_ellipsis[i])
+ {
+ if (inputs[i].shape.DimensionsCount() != ((int32_t)labels->size()))
+ {
+ throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank " +
+ std::to_string(labels->size()) + " but got: " +
+ std::to_string(inputs[i].shape.DimensionsCount())};
+ }
+ for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx)
+ {
+ const int label = (*labels)[label_idx];
+ recordLabelToDimension(label, label_idx, inputs[i].shape, label_to_dim_sizes);
+ }
+ continue;
+ }
+
+ // Input has an ellipsis.
+ if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size())
+ {
+ throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " +
+ std::to_string(labels->size() - 1) + " but got: " +
+ std::to_string(inputs[i].shape.DimensionsCount())};
+ }
+ int ellipsis_axis = -1;
+ const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1;
+ for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx)
+ {
+ const int label = (*labels)[label_idx];
+ if (label == kEllipsisLabel)
+ {
+ ellipsis_axis = label_idx;
+ continue;
+ }
+ // Current label is not an ellipsis.
+ const int axis = label_idx + (ellipsis_axis == -1 ? 0 : num_bcast_dims - 1);
+ recordLabelToDimension(label, axis, inputs[i].shape, label_to_dim_sizes);
+ }
+ // Found an ellipsis. Replace 'kEllipsisLabel' with broadcasting
+ // dimensions.
+ if (ellipsis_axis != -1)
+ {
+ insertBroadcastLabels(num_bcast_dims, num_named_labels, ellipsis_axis, labels,
+ &input_label_counts->at(i));
+ max_bcast_dims = std::max(max_bcast_dims, num_bcast_dims);
+ }
+ }
+
+ std::vector<bool>::iterator it_input =
+ std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true);
+ if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis)
+ {
+ return;
+ }
+ // Insert broadcasting dimensions in the output labels.
+ auto it = std::find(output_labels->begin(), output_labels->end(), kEllipsisLabel);
+ if (it != output_labels->end())
+ {
+ const int ellipsis_axis = it - output_labels->begin();
+ insertBroadcastLabels(max_bcast_dims, num_named_labels, ellipsis_axis, output_labels,
+ output_label_counts);
+ }
+ else if (max_bcast_dims > 0)
+ {
+ std::runtime_error{"Output contains " + std::to_string(max_bcast_dims) +
+ " broadcasting dimension(s) but no ellipsis " +
+ "(...) was found in the output subscripts."};
+ }
+ // Populate DimensionType for the new broadcasting labels.
+ label_types->resize(num_named_labels + max_bcast_dims, kBroadcasting);
+ }
+
+ void recordLabelToDimension(const int32_t label, const int axis, const Shape &input_shape,
+ LabelToDimSizes *label_to_dim_sizes)
+ {
+ const int32_t input_dim = input_shape.Dims(axis);
+ // We know that label_to_dim_sizes has the size to accommodate named labels.
+ if (label_to_dim_sizes->at(label) != 0 && label_to_dim_sizes->at(label) != input_dim)
+ {
+ std::runtime_error{"Expected dimension " + std::to_string(label_to_dim_sizes->at(label)) +
+ " at axis " + std::to_string(axis) +
+ " of the input shaped but got dimension " + std::to_string(input_dim)};
+ }
+ (*label_to_dim_sizes)[label] = input_dim;
+ }
+
+ void insertBroadcastLabels(int num_bcast_dims, int num_named_labels, int ellipsis_axis,
+ Labels *labels, LabelCounts *label_counts)
+ {
+ labels->erase(labels->begin() + ellipsis_axis);
+ labels->insert(labels->begin() + ellipsis_axis, num_bcast_dims, 0);
+ std::iota(labels->begin() + ellipsis_axis, labels->begin() + ellipsis_axis + num_bcast_dims,
+ num_named_labels);
+ // Increment label counts. Since these are new labels, the count is set
+ // to 1.
+ label_counts->resize(num_named_labels + num_bcast_dims, 1);
+ }
+
+ template <typename T>
+ void reduceOperand(const InputTensor<T> &input, const std::vector<DimensionType> &label_types,
+ const LabelCounts &label_counts, Labels *labels, Labels *free_labels,
+ bool *swap_free_and_contract, Tensor *output)
+ {
+ // Find the permutation to transpose the input dimensions in the order of
+ // DimensionType; i.e. batch, free, contract and reduce dimensions. This
+ // makes it more convenient to invoke Reduce/Contract operations.
+ std::vector<int32_t> permutation(input.shape.DimensionsCount());
+ std::iota(permutation.begin(), permutation.end(), 0);
+ Tensor input_transposed;
+
+ // Check if we can avoid the transpose. We need to flip the adj_x (or adj_y)
+ // flag during BatchMatMul. This is an extra optimization not necessary for
+ // correctness.
+ if (shouldSwapFreeAndContract(*labels, label_types))
+ {
+ *swap_free_and_contract = true;
+ }
+ else
+ {
+ std::sort(permutation.begin(), permutation.end(), [&](int i, int j) {
+ int label_i = (*labels)[i];
+ int label_j = (*labels)[j];
+ return std::tie(label_types[label_i], label_i) < std::tie(label_types[label_j], label_j);
+ });
+ }
+ // Transpose the input so that DimensionTypes are in order.
+ transposeOperand<T>(input, permutation, &input_transposed);
+
+ permuteLabels(permutation, labels);
+
+ // Take the generalized diagonal for dimensions with repeated axis labels.
+ Tensor input_deduped;
+ labels->erase(std::unique(labels->begin(), labels->end()), labels->end());
+ strideOrInflate<T>(input_transposed, *labels, label_counts, false /* should_inflate */,
+ &input_deduped);
+
+ // Reshape denotes the rank-5 shape [broadcast, batch, free, contract,
+ // reduce] where we've compacted the dimensions of each DimensionType.
+ std::vector<int32_t> reshape(5, 1);
+
+ // The output shape is [batch shape] + [free size, contract size]
+ // That is, the batch shape is preserved (for broadcasting while
+ // contracting) while the free dims and contract dims are compressed to one
+ // dimension each.
+ Shape output_shape;
+ std::vector<int32_t> output_shape_dims;
+ for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx)
+ {
+ const int label = labels->at(label_idx);
+ int32_t dim = input_deduped.shape.Dims(label_idx);
+ if (label_types[label] == kBroadcasting || label_types[label] == kBatch)
+ {
+ output_shape_dims.push_back(dim);
+ }
+ else if (label_types[label] == kFree)
+ {
+ free_labels->push_back(label);
+ }
+ reshape[label_types[label]] *= dim;
+ }
+
+ if (*swap_free_and_contract)
+ std::swap(reshape[kFree], reshape[kContract]);
+
+ output_shape_dims.push_back(reshape[kFree]);
+ output_shape_dims.push_back(reshape[kContract]);
+
+ output_shape.ReplaceWith(output_shape_dims.size(), output_shape_dims.data());
+
+ if (reshape[kReduce] == 1)
+ { // No need to actually reduce.
+ return copyFrom(input_deduped, output_shape, output);
+ }
+
+ allocateTemp(output_shape, output);
+
+ using Reducer = Eigen::internal::SumReducer<T>;
+ using Index = typename TTypes<T>::Tensor::Index;
+
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor.
+ const int32_t output_size =
+ reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract];
+ functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce(
+ device, output->shaped<T, 1>({output_size}),
+ input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}),
+ Reducer());
+ }
+
+ bool shouldSwapFreeAndContract(const Labels &labels,
+ const std::vector<DimensionType> &label_types)
+ {
+ // Check that ordering is according to dimension type, with the role of
+ // free and contract dimensions swapped.
+ std::vector<int> remap = {0, 1, 3, 2, 4};
+ for (size_t i = 0; i + 1 < labels.size(); ++i)
+ {
+ const int dimtype_a = remap[label_types[labels[i]]];
+ const int dimtype_b = remap[label_types[labels[i + 1]]];
+ if (dimtype_a > dimtype_b || (dimtype_a == dimtype_b && labels[i] > labels[i + 1]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template <typename T>
+ void transposeOperand(const InputTensor<T> &input, const std::vector<int32_t> &permutation,
+ Tensor *output)
+ {
+ if (!shouldTranspose(input.shape, permutation))
+ {
+ copyFrom(input, input.shape, output);
+ return;
+ }
+ Shape transposed_shape(input.shape.DimensionsCount());
+ for (int i = 0; i < input.shape.DimensionsCount(); ++i)
+ {
+ transposed_shape.SetDim(i, input.shape.Dims(permutation[i]));
+ }
+ // For empty Tensors, just change the shape. E.g. we may need to transpose
+ // from shape [1, 0, 5] to [5, 1, 0].
+ if (input.shape.FlatSize() == 0)
+ {
+ copyFrom(input, transposed_shape, output);
+ return;
+ }
+
+ temp_operand.emplace_back(std::make_unique<T[]>(transposed_shape.FlatSize()));
+ T *new_buffer = temp_operand.back().get();
+
+ TransposeParams transpose_params;
+ transpose_params.perm_count = permutation.size();
+ for (size_t i = 0; i < permutation.size(); i++)
+ {
+ transpose_params.perm[i] = permutation[i];
+ }
+
+ Transpose<T>(transpose_params, input.shape, input.buffer, transposed_shape, new_buffer);
+
+ output->shape.ReplaceWith(transposed_shape.DimensionsCount(), transposed_shape.DimsData());
+ output->buffer = new_buffer;
+ }
+
+ bool shouldTranspose(const Shape &input_shape, const std::vector<int32_t> &permutation)
+ {
+ if (input_shape.DimensionsCount() < 2)
+ return false;
+ for (size_t i = 0; i < permutation.size(); ++i)
+ {
+ if (permutation[i] != (int32_t)i)
+ return true;
+ }
+ return false;
+ }
+
+ template <typename T>
+ void copyFrom(const InputTensor<T> &input, const Shape &shape, Tensor *output)
+ {
+ Tensor temp_tensor;
+ temp_tensor.shape.ReplaceWith(input.shape.DimensionsCount(), input.shape.DimsData());
+ temp_operand.emplace_back(std::make_unique<float[]>(input.shape.FlatSize()));
+ temp_tensor.buffer = temp_operand.back().get();
+ memcpy(temp_tensor.buffer, input.buffer, input.shape.FlatSize() * sizeof(float));
+
+ copyFrom(temp_tensor, shape, output);
+ }
+
+ void copyFrom(const Tensor &input, const Shape &shape, Tensor *output)
+ {
+ if (output->copyFrom(input, shape))
+ return;
+
+ throw std::runtime_error{"Einsum: Encountered error while reshaping a Tensor"};
+ }
+
+ // Permutes the labels according to the given permutation.
+ void permuteLabels(const std::vector<int32_t> &permutation, Labels *labels)
+ {
+ Labels permuted_labels(labels->size());
+ for (size_t i = 0; i < labels->size(); ++i)
+ {
+ permuted_labels[i] = (*labels)[permutation[i]];
+ }
+ labels->swap(permuted_labels);
+ }
+
+ // If there are repeated labels in either the input or output, then this
+ // strides the input (e.g. iii->i) or inflates it (e.g. i->iii), respectively.
+ template <typename T>
+ void strideOrInflate(const Tensor &input, const Labels &labels, const LabelCounts &label_counts,
+ const bool should_inflate, Tensor *output)
+ {
+ // Return early if there are no repeated indices.
+ if (std::all_of(label_counts.begin(), label_counts.end(), [](int c) { return c <= 1; }))
+ {
+ return copyFrom(input, input.shape, output);
+ }
+ // We reshape so that each repeated label is compressed to one dimension.
+ // E.g. for iiij -> ij, the shape [3, 3, 3, 5] would be compressed to [27,
+ // 5]. Striding appropriately (in this case with strides 13 (=1+3+9) and 1)
+ // recovers the generalized diagonal of shape [3, 5].
+ std::vector<int32_t> reshape;
+ std::vector<int32_t> strides;
+ // Strided and inflated shapes correspond to input and output shapes,
+ // respectively, if should_inflate is true (and vice-versa if should_inflate
+ // is false). E.g. they are [3, 5] and [3, 3, 3, 5] in the above example.
+ Shape strided_shape;
+ Shape inflated_shape;
+ std::vector<int32_t> strided_shape_dims;
+ std::vector<int32_t> inflated_shape_dims;
+ for (int label : labels)
+ {
+ const int32_t count = label_counts[label];
+ const int current_axis =
+ should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size();
+ const int32_t dim = input.shape.Dims(current_axis);
+ strided_shape_dims.push_back(dim);
+ inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim);
+ const int32_t reshape_dim = std::pow(dim, count);
+ reshape.push_back(reshape_dim);
+ // While taking the d-diagonal in a rank k Tensor, we take d
+ // equally-spaced elements including the first and last element. Then,
+ // (d - 1) * stride = d^k - 1, or, stride = (d^k - 1)/(d - 1).
+ const int32_t stride = (dim > 1 && count > 1) ? (reshape_dim - 1) / (dim - 1) : 1;
+ strides.push_back(stride);
+ }
+
+ strided_shape.ReplaceWith(strided_shape_dims.size(), strided_shape_dims.data());
+ inflated_shape.ReplaceWith(inflated_shape_dims.size(), inflated_shape_dims.data());
+
+ Shape output_shape = Shape(should_inflate ? inflated_shape : strided_shape);
+
+ output->shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ temp_operand.emplace_back(std::make_unique<float[]>(output_shape.FlatSize()));
+ output->buffer = temp_operand.back().get();
+
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ switch (reshape.size())
+ {
+#define NDIMS_CASE(N) \
+ case N: \
+ { \
+ if (should_inflate) \
+ { \
+ auto output_map = output->shaped<T, N>(reshape); \
+ auto input_map = input.shaped<T, N>(strided_shape_dims); \
+ functor::InflateFunctor<Eigen::ThreadPoolDevice, T, N>()(device, input_map, strides, \
+ output_map); \
+ } \
+ else \
+ { \
+ auto input_map = input.shaped<T, N>(reshape); \
+ auto output_map = output->shaped<T, N>(strided_shape_dims); \
+ functor::StrideFunctor<Eigen::ThreadPoolDevice, T, N>()(device, input_map, strides, \
+ output_map); \
+ } \
+ } \
+ break;
+ NDIMS_CASE(1);
+ NDIMS_CASE(2);
+ NDIMS_CASE(3);
+ NDIMS_CASE(4);
+ NDIMS_CASE(5);
+ NDIMS_CASE(6);
+ default:
+ throw std::runtime_error{"Unsupported rank: " + std::to_string(reshape.size()) +
+ " while handling repeated indices. Up to rank 6 is supported."};
+#undef NDIMS_CASE
+ }
+ }
+
+ void allocateTemp(const Shape &shape, Tensor *output)
+ {
+ output->shape.ReplaceWith(shape.DimensionsCount(), shape.DimsData());
+ temp_operand.emplace_back(std::make_unique<float[]>(shape.FlatSize()));
+ output->buffer = temp_operand.back().get();
+ }
+
+ // Contracts the inputs along the last axis. (or the second last if the
+ // corresponding value of swap_free_and_contract is true). The batch
+ // dimensions are broadcast to the output shape.
+ // TODO(anudhyan): Factor this function into a BatchMatMul functor and support
+ // transpose_x and transpose_y attributes (in addition to adj_x and adj_y).
+ // Also, the BatchMatMul might devolve into a component-wise multiplication
+ // when the matrix shape is [1,1]; in this case BatchMatMul functor would be
+ // very inefficient. The functor should detect if this is the case and perform
+ // componentwise multiplication functor instead.
+ void contractOperands(std::vector<Tensor> &inputs, std::vector<bool> &swap_free_and_contract,
+ Tensor *output)
+ {
+ if (inputs.size() == 1)
+ return copyFrom(inputs[0], inputs[0].shape, output);
+
+ MatMulBCast bcast(inputs[0].shape, inputs[1].shape);
+ if (!bcast.IsValid())
+ {
+ throw std::runtime_error{"Einsum: Invalid broadcasting dimensions"};
+ }
+
+ Tensor lhs;
+ reshapeToRank3(inputs[0], bcast.x_batch_size(), &lhs);
+ Tensor rhs;
+ reshapeToRank3(inputs[1], bcast.y_batch_size(), &rhs);
+ Shape old_output_shape = bcast.output_batch_shape();
+ Shape output_shape(old_output_shape.DimensionsCount() + inputs.size());
+ for (int i = 0; i < old_output_shape.DimensionsCount(); i++)
+ {
+ output_shape.SetDim(i, old_output_shape.Dims(i));
+ }
+
+ for (size_t i = 0; i < inputs.size(); ++i)
+ {
+ const int32_t free_axis =
+ inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2);
+ output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis));
+ }
+ bool adj_x = swap_free_and_contract[0];
+ bool adj_y = !swap_free_and_contract[1];
+
+ allocateTemp(output_shape, output);
+
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ if (lhs.shape.FlatSize() == 0 || rhs.shape.FlatSize() == 0)
+ {
+ functor::SetZeroFunctor<Eigen::ThreadPoolDevice, float> set_zero;
+ set_zero(device,
+ typename TTypes<float, 1>::Tensor(output->base<float>(), output->shape.FlatSize()));
+ return;
+ }
+
+ Tensor output_reshaped;
+ reshapeToRank3(*output, bcast.output_batch_size(), &output_reshaped);
+
+ // LaunchBatchMatMul::Launch(lhs, rhs, adj_x, adj_y, bcast, &output_reshaped);
+ BatchMatMul batchMatMul;
+ batchMatMul.prepare(lhs.shape, rhs.shape, adj_x, adj_y);
+ batchMatMul(lhs.shape, lhs.base<float>(), rhs.shape, rhs.base<float>(), adj_x, adj_y,
+ output_reshaped.shape, output_reshaped.base<float>());
+ }
+
+ void reshapeToRank3(const Tensor &input, int batch_size, Tensor *output)
+ {
+ const int rank = input.shape.DimensionsCount();
+ Shape output_shape({batch_size, input.shape.Dims(rank - 2), input.shape.Dims(rank - 1)});
+ copyFrom(input, output_shape, output);
+ }
+
+private:
+ bool _prepared;
+
+ OperandLabels _input_labels;
+ Labels _output_labels;
+ std::vector<DimensionType> _label_types;
+ OperandLabelCounts _input_label_counts;
+ LabelCounts _output_label_counts;
+ std::vector<bool> _input_has_ellipsis;
+ bool _output_has_ellipsis = false;
+
+ std::vector<std::unique_ptr<float[]>> temp_operand;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EINSUM_H__
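
A sketch of how the Einsum class above can be driven for a batched matrix multiply; the equation, shapes, and function name are illustrative only:

    #include "cker/operation/Einsum.h"

    #include <string>
    #include <vector>

    void EinsumExample()
    {
      std::string equation = "bij,bjk->bik"; // 'b' batch, 'i'/'k' free, 'j' contract
      const nnfw::cker::Shape lhs_shape({2, 2, 3});
      const nnfw::cker::Shape rhs_shape({2, 3, 4});
      const nnfw::cker::Shape out_shape({2, 2, 4});

      std::vector<float> lhs(lhs_shape.FlatSize(), 1.0f);
      std::vector<float> rhs(rhs_shape.FlatSize(), 1.0f);
      std::vector<float> out(out_shape.FlatSize());

      nnfw::cker::Einsum einsum;
      einsum.prepare(equation); // parses the equation once
      einsum(equation, {lhs_shape, rhs_shape}, {lhs.data(), rhs.data()}, out_shape, out.data());
      // With all-ones inputs every output element equals the contracted length, i.e. 3.
    }
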
diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h
new file mode 100644
index 000000000..9d080d89b
--- /dev/null
+++ b/compute/cker/include/cker/operation/Elementwise.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ELEMENTWISE_H__
+#define __NNFW_CKER_ELEMENTWISE_H__
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Sin(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = std::sin(input_data[i]);
+ }
+}
+
+inline void Cos(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = std::cos(input_data[i]);
+ }
+}
+
+inline void Abs(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ auto input_map = MapAsVector(input_data, input_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ output_map.array() = input_map.array().abs();
+}
+
+inline void Rsqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = 1.f / std::sqrt(input_data[i]);
+ }
+}
+
+template <typename T>
+inline void Neg(const Shape &input_shape, const T *input_data, const Shape &output_shape,
+ T *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = -input_data[i];
+ }
+}
+
+inline void Log(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = std::log(input_data[i]);
+ }
+}
+
+inline void Floor(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = std::floor(input_data[i]);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ELEMENTWISE_H__
diff --git a/compute/cker/include/cker/operation/Erf.h b/compute/cker/include/cker/operation/Erf.h
new file mode 100644
index 000000000..a9be3654a
--- /dev/null
+++ b/compute/cker/include/cker/operation/Erf.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ERF_H__
+#define __NNFW_CKER_ERF_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Erf(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = std::erf(input_data[i]);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ERF_H__
diff --git a/compute/cker/include/cker/operation/Exp.h b/compute/cker/include/cker/operation/Exp.h
new file mode 100644
index 000000000..ed3c73d73
--- /dev/null
+++ b/compute/cker/include/cker/operation/Exp.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EXP_H__
+#define __NNFW_CKER_EXP_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Exp(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = std::exp(input_data[i]);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EXP_H__
diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h
new file mode 100644
index 000000000..14daf9839
--- /dev/null
+++ b/compute/cker/include/cker/operation/Fill.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FILL_H__
+#define __NNFW_CKER_FILL_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+inline void Fill(const Shape &input_shape, int *input_data, const T value_data,
+ const Shape &output_shape, T output_data)
+{
+ int input_size = input_shape.FlatSize();
+ int output_size = 1;
+ for (int i = 0; i < input_size; i++)
+ {
+ output_size *= input_data[i];
+ }
+
+ if (output_size == output_shape.FlatSize())
+ {
+ for (int i = 0; i < output_size; i++)
+ {
+ output_data[i] = *value_data;
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output");
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FILL_H__
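
Because the template parameter T is deduced from both the fill value and the output argument, callers pass pointers for both; a small illustrative call (names made up):

    #include "cker/operation/Fill.h"

    void FillExample()
    {
      int dims[2] = {2, 3}; // contents of the shape tensor
      const nnfw::cker::Shape dims_shape({2});
      const nnfw::cker::Shape out_shape({2, 3});

      float fill_value = 1.5f;
      float output[6];
      // T is deduced as float *, so the value is passed by pointer as well.
      nnfw::cker::Fill(dims_shape, dims, &fill_value, out_shape, output);
      // output now holds six copies of 1.5f.
    }
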
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..958532402
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include <ruy/context.h>
+#include "cker/operation/FullyConnectedDense16x1.h"
+#include "cker/operation/FullyConnectedSparse16x1.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+class FCTempArena
+{
+public:
+ FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
+ {
+ // DO NOTHING
+ }
+
+ void prepare(const Shape &input_shape, const Shape &weights_shape)
+ {
+ auto input_size = input_shape.FlatSize();
+ input_quantized.resize(input_size);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ int batch_size = input_size / weights_shape.Dims(1);
+ scaling_factors.resize(batch_size);
+ prepared = true;
+ }
+
+public:
+ bool prepared;
+ std::vector<int8_t> input_quantized;
+ std::vector<float> scaling_factors;
+ std::vector<int32_t> accum_scratch;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &, const float *bias_data,
+ const Shape &, float *output_data)
+{
+ int total_input_size = input_shape.FlatSize();
+ int input_size = weights_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = weights_shape.Dims(0);
+
+ // Output = bias if bias tensor exists.
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+ // Compute output += weight * input
+ MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
+ output_data, /*result_stride=*/1);
+
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+}
+
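+// Note on the quantized arithmetic below: assuming the usual TFLite convention where
+// input_offset and weights_offset carry the negated zero points, each term
+// (filter_val + filter_offset) * (input_val + input_offset) accumulates the product of
+// the de-quantized values scaled by 1 / (input_scale * filter_scale).
+// output_multiplier / output_shift then fold input_scale * filter_scale / output_scale
+// back in, output_offset re-applies the output zero point, and the result is clamped to
+// the fused activation range.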
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(filter_shape.DimensionsCount() >= 2);
+ assert(output_shape.DimensionsCount() >= 1);
+
+ assert(output_activation_min <= output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth =
+ MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ int32_t acc = 0;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ int32_t input_val = input_data[b * accum_depth + d];
+ int32_t filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_c];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+ }
+ }
+}
+
+inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const int8_t *filter_data, const Shape &, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ FCTempArena &temp_arena, ruy::Context *ruy_context)
+{
+ int total_input_size = input_shape.FlatSize();
+ const int input_size = filter_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = filter_shape.Dims(0);
+
+ // Output = bias if bias tensor exists.
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+  // Skip the matrix multiplication entirely for an all-zero input.
+ if (IsZeroVector(input_data, total_input_size))
+ {
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ return;
+ }
+
+  // Quantize input from float to int8 + quantization params (scaling factor).
+ float unused_min, unused_max;
+ float *scaling_factors_ptr = temp_arena.scaling_factors.data();
+ int8_t *quant_data = temp_arena.input_quantized.data();
+
+ // Quantize each batch independently.
+ for (int b = 0; b < batch_size; ++b)
+ {
+ const int offset = b * input_size;
+ SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
+ &unused_max, &scaling_factors_ptr[b]);
+ // Incorporate scaling of the filter.
+ scaling_factors_ptr[b] *= params.weights_scale;
+ }
+
+// Compute output += weight * quantized_input
+#ifdef USE_RUY_GEMV
+ auto output_size = output_shape.FlatSize();
+ temp_arena.accum_scratch.resize(output_size);
+ int32_t *scratch = temp_arena.accum_scratch.data();
+ MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
+ scaling_factors_ptr, batch_size, scratch, output_data,
+ /*result_stride=*/1, ruy_context);
+#else
+ MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
+ scaling_factors_ptr, batch_size, output_data,
+ /*result_stride=*/1);
+ UNUSED_RELEASE(ruy_context);
+ UNUSED_RELEASE(output_shape);
+#endif
+
+ // Apply activation function to floats.
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+ return;
+}
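+
+// Usage sketch (not part of the kernel): the hybrid path expects float activations with
+// symmetric int8 weights and a caller-owned scratch arena; `ctx` below stands for a
+// ruy::Context owned by the caller.
+//
+//   FCTempArena arena;
+//   if (!arena.prepared)
+//     arena.prepare(input_shape, filter_shape); // sizes the per-batch quantization buffers
+//   FullyConnectedHybrid(params, input_shape, input_data, filter_shape, filter_data,
+//                        bias_shape, bias_data, output_shape, output_data, arena, ctx);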
+
+inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments,
+ const uint16_t *w1_indices)
+{
+  UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
+ {
+ for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ int idx_1 = w1_indices[pw1];
+ output_data[b * output_depth + idx_0] +=
+ weights_data[pw1] * input_data[b * accum_depth + idx_1];
+ }
+ }
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
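+
+// Weight layout sketch (for readability, not part of the kernel): w1_segments / w1_indices
+// describe the weights in a CSR-like form over output rows. For output row r, the non-zero
+// weights are weights_data[w1_segments[r] .. w1_segments[r + 1]) and w1_indices holds the
+// input-column index of each of them. For example, a 2 x 4 weight matrix with non-zeros
+// (0,1)=0.5, (0,3)=1.0 and (1,2)=2.0 would be stored as
+//   w1_segments = {0, 2, 3}, w1_indices = {1, 3, 2}, weights_data = {0.5, 1.0, 2.0}.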
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/FullyConnectedDense16x1.h b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
new file mode 100644
index 000000000..a7e9efd7f
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+#if defined(__aarch64__) && defined(USE_NEON)
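+// 16x1 dense GEMV kernel: output rows are processed in panels of 16, keeping the 16
+// accumulators in NEON registers for the whole input-column loop. It assumes that
+// num_units (weights_shape.Dims(0)) is a multiple of 16 and that the weights have been
+// pre-packed so that, inside each 16-row panel, the 16 weights of every input column are
+// stored contiguously (element (i + r, j) at weights[i * input_size + j * 16 + r]).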
+inline void FullyConnected16x1Float32(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &,
+ const float *bias_data, const Shape &, float *output_data)
+{
+ int total_input_size = input_shape.FlatSize();
+ int input_size = weights_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = weights_shape.Dims(0);
+
+ float *out = output_data;
+ const float *weights = weights_data;
+ int rows = num_units;
+ int cols = input_size;
+ int col_stride = input_size;
+ const float *x = input_data;
+
+ // Output = bias if bias tensor exists.
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+ // rows : out, cols : in
+ int i, j;
+ for (i = 0; i < rows; i += 16)
+ {
+ const float *w = &weights[i * col_stride];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+ float *__restrict y = &out[i];
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j = 0; j < cols; j++)
+ {
+ float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+ float32x4_t xj;
+
+ xj = vld1q_dup_f32(&x[j]);
+
+ wvec0_3 = vld1q_f32(&w[0]);
+ y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+ wvec4_7 = vld1q_f32(&w[4]);
+ y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+ wvec8_11 = vld1q_f32(&w[8]);
+ y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+ wvec12_15 = vld1q_f32(&w[12]);
+ y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+
+ w += 16;
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+}
+#endif
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
new file mode 100644
index 000000000..28ae7a3bc
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
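+// Block-sparse 16x1 GEMV kernel: output rows are grouped into panels of 16 and
+// output_depth is assumed to be a multiple of 16. For panel p, w1_segments[p] ..
+// w1_segments[p + 1] enumerates its non-zero 16x1 column blocks, w1_indices gives the
+// input-column index of each block, and weights_data stores the 16 values of every block
+// contiguously, in segment order. Note that the weight pointer is consumed sequentially
+// across the batch loop below, so the kernel effectively assumes batches == 1.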
+inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments, const uint16_t *w1_indices)
+{
+ UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ int depth_size = output_depth / 16;
+ for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
+#ifdef USE_NEON
+ {
+ float *__restrict y;
+ y = &output_data[b * output_depth + idx_0 * 16];
+ /* keep y[0..15] in registers for duration of inner loop */
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ auto idx_1 = w1_indices[pw1];
+ float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
+ float32x4_t wvec;
+
+ wvec = vld1q_f32(&weights_data[0]);
+ y0_3 = vmlaq_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&weights_data[4]);
+ y4_7 = vmlaq_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&weights_data[8]);
+ y8_11 = vmlaq_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&weights_data[12]);
+ y12_15 = vmlaq_f32(y12_15, wvec, xj);
+
+ weights_data += 16;
+ }
+ /* save y[0..15] back to memory */
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+ }
+#else
+ {
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ float *__restrict y;
+ float xj;
+ auto idx_1 = w1_indices[pw1];
+ xj = input_data[b * accum_depth + idx_1];
+ y = &output_data[b * output_depth + idx_0 * 16];
+ y[0] += weights_data[0] * xj;
+ y[1] += weights_data[1] * xj;
+ y[2] += weights_data[2] * xj;
+ y[3] += weights_data[3] * xj;
+ y[4] += weights_data[4] * xj;
+ y[5] += weights_data[5] * xj;
+ y[6] += weights_data[6] * xj;
+ y[7] += weights_data[7] * xj;
+ y[8] += weights_data[8] * xj;
+ y[9] += weights_data[9] * xj;
+ y[10] += weights_data[10] * xj;
+ y[11] += weights_data[11] * xj;
+ y[12] += weights_data[12] * xj;
+ y[13] += weights_data[13] * xj;
+ y[14] += weights_data[14] * xj;
+ y[15] += weights_data[15] * xj;
+ weights_data += 16;
+ }
+ }
+#endif
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h
new file mode 100644
index 000000000..d17a5796b
--- /dev/null
+++ b/compute/cker/include/cker/operation/FusedBatchNorm.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FUSEDBATCHNORM_H__
+#define __NNFW_CKER_FUSEDBATCHNORM_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/MatmulBCast.h"
+
+#include "Transpose.h"
+#include "BatchMatMul.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <numeric>
+#include <algorithm>
+#include <cstring>
+#include <memory>
+
+namespace nnfw
+{
+namespace cker
+{
+
+class FusedBatchNorm
+{
+public:
+ FusedBatchNorm() : _prepared(false)
+ {
+ // DO NOTHING
+ }
+
+ void prepare() { _prepared = true; }
+
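+  // Training-mode batch normalization over all dimensions except the last (depth) one,
+  // with input_data[0] = x, input_data[1] = scale and input_data[2] = offset:
+  //   mean[c] = sum_i x[i, c] / rest_size
+  //   var[c]  = sum_i (x[i, c] - mean[c])^2 / rest_size
+  //   y[i, c] = (x[i, c] - mean[c]) * rsqrt(var[c] + epsilon) * scale[c] + offset[c]
+  // where rest_size is the product of the non-depth dimensions.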
+ void operator()(const std::vector<Shape> &input_shapes,
+ const std::vector<const float *> &input_data, const Shape &output_shape,
+ float *output_data, FusedBatchNormParams param)
+ {
+    // TODO: support fused_batch_norm when is_training is false
+    assert(param.is_training == true);
+
+    // TODO: support cases where dim[1] != 1 or dim[3] != 1.
+    // Here we only support input tensors of shape [B, 1, X, 1].
+    assert(input_shapes[0].Dims(1) == 1 && input_shapes[0].Dims(3) == 1);
+
+    if (!_prepared)
+    {
+      prepare();
+    }
+
+ Tensor transformed_input[5];
+ Tensor transformed_output;
+
+ const int num_inputs = input_shapes.size();
+ std::vector<InputTensor<float>> inputs(num_inputs);
+ for (int i = 0; i < num_inputs; i++)
+ {
+ inputs[i].shape.ReplaceWith(input_shapes[i].DimensionsCount(), input_shapes[i].DimsData());
+ inputs[i].buffer = input_data[i];
+ copyFrom<float>(inputs[i], inputs[i].shape, &transformed_input[i]);
+ }
+
+ InputTensor<float> output;
+ output.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output.buffer = output_data;
+ copyFrom<float>(output, output.shape, &transformed_output);
+
+ // TODO: support transpose if data_format is NCHW
+    // Here, Eigen uses a RowMajor (NHWC) kernel.
+
+ typename TTypes<float, 4>::Tensor x(transformed_input[0].shaped<float, 4>());
+ typename TTypes<float, 4>::Tensor y(transformed_output.shaped<float, 4>());
+ typename TTypes<float, 1>::Tensor scale(transformed_input[1].shaped<float, 1>());
+ typename TTypes<float, 1>::Tensor offset(transformed_input[2].shaped<float, 1>());
+
+ const int depth = x.dimension(3);
+ const int size = x.size();
+ const int rest_size = size / depth;
+ Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth);
+
+ Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth);
+ Eigen::array<int, 1> reduce_dims({0});
+ Eigen::array<int, 2> bcast_spec({rest_size, 1});
+
+ auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<float>();
+ const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1;
+ float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size));
+ // This adjustment is for Bessel's correction
+ float rest_size_adjust =
+ static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one);
+
+ Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth);
+ Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth);
+
+ const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();
+
+ batch_mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv);
+ auto x_centered = x_rest_by_depth - batch_mean.reshape(one_by_depth).broadcast(bcast_spec);
+
+ batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv;
+ auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(bcast_spec);
+ auto x_scaled = x_centered * scaling_factor;
+ auto x_shifted =
+ (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>();
+
+ UNUSED_RELEASE(rest_size_adjust);
+
+ y.reshape(rest_by_depth).device(d) = x_shifted;
+
+ memcpy(output_data, y.data(), output_shape.FlatSize() * sizeof(float));
+ }
+
+ template <typename T>
+ void copyFrom(const InputTensor<T> &input, const Shape &shape, Tensor *output)
+ {
+ Tensor temp_tensor;
+ temp_tensor.shape.ReplaceWith(input.shape.DimensionsCount(), input.shape.DimsData());
+ temp_operand.emplace_back(std::make_unique<float[]>(input.shape.FlatSize()));
+ temp_tensor.buffer = temp_operand.back().get();
+ memcpy(temp_tensor.buffer, input.buffer, input.shape.FlatSize() * sizeof(float));
+
+ copyFrom(temp_tensor, shape, output);
+ }
+
+ void copyFrom(const Tensor &input, const Shape &shape, Tensor *output)
+ {
+ if (output->copyFrom(input, shape))
+ return;
+
+ throw std::runtime_error{"Einsum: Encountered error while reshaping a Tensor"};
+ }
+
+private:
+ bool _prepared;
+ std::vector<std::unique_ptr<float[]>> temp_operand;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FUSEDBATCHNORM_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..65a71887e
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+ const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+ T *output_data)
+{
+ int axis = op_params.axis;
+ if (axis < 0)
+ {
+ axis += input_shape.DimensionsCount();
+ }
+ assert(axis >= 0);
+ assert(axis < input_shape.DimensionsCount());
+ const int axis_size = input_shape.Dims(axis);
+ const int coords_count = coords_shape.FlatSize();
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
+ for (int outer = 0; outer < outer_size; ++outer)
+ {
+ for (int i = 0; i < coords_count; ++i)
+ {
+ assert(coords_data[i] >= 0);
+ assert(coords_data[i] < axis_size);
+ std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+ input_data + (outer * axis_size + coords_data[i]) * inner_size,
+ sizeof(T) * inner_size);
+ }
+ }
+}
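+
+// Example (sketch): gathering rows (axis 0) of a 3 x 2 float matrix.
+//
+//   input  (3 x 2): { 1, 2,  3, 4,  5, 6 }
+//   coords        : { 2, 0 }
+//   output (2 x 2): { 5, 6,  1, 2 }
+//
+// With axis = 1 the same coords would instead select columns within each row.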
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h
new file mode 100644
index 000000000..a0abf2935
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/BCast.h
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_BCAST_H__
+#define __NNFW_CKER_HELPER_BCAST_H__
+
+/**
+ * TODO: This file will be moved into an upper folder when integrated with other
+ *       custom operations, and it should be merged with EinsumHelper's BCast.
+ */
+
+#include "cker/Shape.h"
+#include "cker/eigen/EigenSupport.h"
+
+namespace nnfw
+{
+namespace cker
+{
+// Returns the mapping from the output batch indices to the corresponding
+// input's batch indices, given the input's "reshape" and "bcast" shapes as
+// returned by the BCastList helper class. The i'th element denotes the
+// (flattened) batch index of the input that must be used to compute the i'th
+// batch output.
+//
+inline void ComputeBatchIndices(const int32_t output_batch_size,
+ const std::vector<int32_t> &reshape,
+ const std::vector<int32_t> &bcast,
+ std::vector<int32_t> *out_indices)
+{
+ // Populates the mapping in out_indices. This algorithm is identical to
+ // the following steps:
+ // - Reshape {0, 1, ..., input_batch_size - 1} to the input shape.
+ // - Broadcast to the output shape.
+ // - Reshape back to a flat 1D vector.
+ out_indices->resize(output_batch_size);
+ int32_t num_output_elements = 1;
+ int32_t num_input_elements = 1;
+ for (int32_t i = reshape.size() - 1; i >= 0; --i)
+ {
+ // Replicate the already populated mapping an additional (dim - 1) times.
+ // If we are broadcasting, just copy the existing mapping.
+ // Otherwise, add another dimension from the input shape.
+ const int32_t dim = std::max(reshape[i], bcast[i]);
+ const int32_t incr = bcast[i] > 1 ? 0 : num_input_elements;
+ for (int32_t k = 0; k < (dim - 1) * num_output_elements; ++k)
+ {
+ (*out_indices)[num_output_elements + k] = (*out_indices)[k] + incr;
+ }
+ num_output_elements *= dim;
+ num_input_elements *= reshape[i];
+ }
+}
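+
+// Example: with reshape = {2, 1} and bcast = {1, 3} (an input batch of 2 broadcast
+// against an output batch shape of {2, 3}), output_batch_size is 6 and the resulting
+// mapping is {0, 0, 0, 1, 1, 1}: the first three output batches read input batch 0 and
+// the last three read input batch 1.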
+
+template <int N> class BCastList
+{
+public:
+ // A vector of int32_t representing the shape of tensor. The 0-th
+ // element is the outer-most dimension and the last element is the
+ // inner-most dimension. Note that we do not use Shape since
+ // it's more convenient to manipulate Vec directly for this module.
+ typedef std::vector<int32_t> Vec;
+
+ // Constructs all helper shapes, following the aforementioned rules.
+ //
+ // If "fewer_dims_optimization" is set to true (the default), the
+ // implementation tries to reduce intermediate dimensions needed to be more
+ // efficient. This is transparent to the caller.
+ //
+ // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have
+ // the same number of dimensions as the larger of the two inputs.
+ //
+ // If return_flattened_batch_indices is true, the implementation will compute
+ // for each output member of the flattened output, which batch indices of
+ // each input correspond to it. This is disabled by default.
+ explicit BCastList(const Vec (&x)[N], const bool fewer_dims_optimization = true,
+ const bool return_flattened_batch_indices = false);
+ ~BCastList() {}
+
+ // Returns true iff two operands are compatible according to the
+ // broadcasting rule.
+ bool IsValid() const { return valid_; }
+ bool IsBroadcastingRequired() const { return broadcasting_required_; }
+
+ // If and only if IsValid(), the following fields can be used in
+ // implementing a broadcasted binary tensor operation according to
+ // the broadcasting rule.
+ const Vec &reshape(int i) const { return reshape_[i]; }
+ const Vec &bcast(int i) const { return bcast_[i]; }
+ const Vec &result_shape() const { return result_; }
+ const Vec &output_shape() const { return output_; }
+ const Vec &grad_reduce_idx(int i) const { return grad_reduce_idx_[i]; }
+ int32_t output_batch_size() const { return output_batch_size_; }
+
+ // Returns the mapping from the flattened output batch indices to x's
+ // flattened batch indices. The result is a vector of length
+ // output_batch_size(). To compute the i'th batch output, a binary matmul-like
+ // operation should use the `x_batch_indices()[i]`th batch index of `x`.
+ // Note: Returns an empty vector if broadcasting is not required. Callers
+ // should only use this when IsBroadcastingRequired() returns true.
+ const std::vector<int32_t> &batch_indices(int i) const { return batch_indices_[i]; }
+
+protected:
+ bool valid_ = true;
+ bool broadcasting_required_ = true;
+ Vec reshape_[N];
+ Vec bcast_[N];
+ Vec result_;
+ Vec output_;
+ Vec grad_reduce_idx_[N];
+
+ int32_t output_batch_size_;
+ std::vector<int32_t> batch_indices_[N];
+
+ static void Reverse(Vec *shape) { std::reverse(shape->begin(), shape->end()); }
+}; // BCastList<N>
+
+template <int N>
+BCastList<N>::BCastList(const BCastList::Vec (&x)[N], const bool fewer_dims_optimization,
+ const bool return_flattened_batch_indices)
+{
+ typedef BCastList::Vec Vec;
+ bool all_equal = true;
+ size_t largest_rank = 0;
+ output_batch_size_ = 1;
+ for (int i = 0; i < N; ++i)
+ {
+ if (x[i] != x[0])
+ {
+ all_equal = false;
+ }
+ if (x[i].size() > largest_rank)
+ {
+ largest_rank = x[i].size();
+ }
+ }
+ if (all_equal)
+ {
+ broadcasting_required_ = false;
+ }
+ if (all_equal && fewer_dims_optimization)
+ {
+ // Fast path for common case of identical shapes.
+ int32_t elements = 1;
+ const int rank = x[0].size();
+ output_.resize(rank);
+ for (int i = 0; i < rank; i++)
+ {
+ const int32_t dim = x[0][i];
+ elements *= dim;
+ output_[i] = dim;
+ }
+ result_.push_back(elements);
+ output_batch_size_ = elements;
+ for (int i = 0; i < N; ++i)
+ {
+ reshape_[i].push_back(elements);
+ bcast_[i].push_back(1);
+ }
+ // grad_reduce_ is left as empty
+ return;
+ }
+
+ // Reverse all the shapes for convenience
+ // After the reverse, 0-th is the inner-most dimension.
+ Vec copy[N];
+ for (int i = 0; i < N; ++i)
+ {
+ copy[i] = x[i];
+ Reverse(&copy[i]);
+ }
+
+ // 1-extend and align all vectors.
+ for (int i = 0; i < N; ++i)
+ {
+ if (copy[i].size() < largest_rank)
+ {
+ copy[i].resize(largest_rank, 1);
+ }
+ }
+ // Going through each dimension starting from the inner-most
+ // dimension, compares dimension of x and y. They are compatible if
+ // they are equal or either is 1.
+
+ // indices of j-th component of each input.
+ bool prev_is_one[N];
+ bool current_is_one[N];
+ for (int i = 0; i < N; ++i)
+ {
+ prev_is_one[i] = false;
+ current_is_one[i] = false;
+ }
+ Vec output;
+ bool output_dim_set = false;
+ int output_dim = -1;
+ bool none_is_one = true;
+ bool set_one = false;
+ for (size_t j = 0; j < largest_rank; ++j)
+ {
+ output_dim = -1;
+ output_dim_set = false;
+ none_is_one = true;
+ // Find which indices are 1.
+ for (int i = 0; i < N; ++i)
+ {
+ // Keep track of which indices are 1.
+ if (copy[i][j] == 1)
+ {
+ current_is_one[i] = true;
+ none_is_one = false;
+ }
+ else
+ {
+ current_is_one[i] = false;
+ if (!output_dim_set || copy[i][j] == output_dim)
+ {
+ output_dim = copy[i][j];
+ output_dim_set = true;
+ }
+ else
+ {
+ valid_ = false;
+ return;
+ }
+ }
+ }
+ output_.push_back(output_dim_set ? output_dim : 1);
+ output_batch_size_ *= output_.back();
+ // All dimensions are 1.
+ if (!output_dim_set)
+ {
+ if (!fewer_dims_optimization)
+ {
+ for (int i = 0; i < N; ++i)
+ {
+ bcast_[i].push_back(1);
+ reshape_[i].push_back(1);
+ }
+ result_.push_back(1);
+ }
+ for (int i = 0; i < N; ++i)
+ {
+ grad_reduce_idx_[i].push_back(largest_rank - 1 - j);
+ }
+ // This will skip updating the previous state to the current one. We'll
+ // explain why this is safe below.
+ // Consider the previous state P, current state C and the next state N.
+ // In the case where N also is all ones (N == C), we'll do the same
+ // optimization here (push back one dimensions if we need to), which is
+ // safe and is expected.
+ //
+ // When N != C, we'll continue as usual. However, we might trigger the
+ // next block if N == P (because we didn't update the previous state).
+ // We trigger the next block if `fewer_dims_optimization` is true.
+ // This means that we did not modify and broadcast / reshapes in this
+ // block (we skipped updating, since the one dimensions can be ignored).
+ // In essence, we only need to check whether the previous non-one state is
+ // equal to the current non-one state.
+
+ continue;
+ }
+ else if ((fewer_dims_optimization) &&
+ std::equal(current_is_one, current_is_one + N, prev_is_one) && set_one)
+ {
+ // It is a run of the same broadcasting case as last time.
+ // We can reshape the input so that fewer dimensions
+ // are involved in the intermediate computation.
+ result_.back() *= output_dim;
+ for (int i = 0; i < N; ++i)
+ {
+ reshape_[i].back() *= copy[i][j];
+ bcast_[i].back() *= current_is_one[i] ? output_dim : 1;
+ if (current_is_one[i] && !none_is_one)
+ {
+ grad_reduce_idx_[i].push_back(largest_rank - 1 - j);
+ }
+ }
+ }
+ else
+ {
+ result_.push_back(output_dim);
+ for (int i = 0; i < N; ++i)
+ {
+ reshape_[i].push_back(copy[i][j]);
+ bcast_[i].push_back(current_is_one[i] ? output_dim : 1);
+ if (current_is_one[i] && !none_is_one)
+ {
+ grad_reduce_idx_[i].push_back(largest_rank - 1 - j);
+ }
+ }
+ }
+ set_one = true;
+ for (int i = 0; i < N; ++i)
+ {
+ prev_is_one[i] = current_is_one[i];
+ }
+ }
+ if (result_.empty())
+ {
+ result_.push_back(1);
+ for (int i = 0; i < N; ++i)
+ {
+ reshape_[i].push_back(1);
+ bcast_[i].push_back(1);
+ }
+ }
+  // Undo the earlier Reverse(): restore outer-most-first ordering of all helper shapes.
+ for (int i = 0; i < N; ++i)
+ {
+ Reverse(&reshape_[i]);
+ Reverse(&bcast_[i]);
+ Reverse(&grad_reduce_idx_[i]);
+ }
+ Reverse(&result_);
+ Reverse(&output_);
+ // Only compute batch indices when we need broadcasting, and we aren't doing
+ // needless work (when the output size is 0 or the
+ // return_flattened_batch_indices isn't enabled).
+ if (return_flattened_batch_indices && broadcasting_required_ && output_batch_size_ > 0)
+ {
+ for (int i = 0; i < N; ++i)
+ {
+ ComputeBatchIndices(output_batch_size_, reshape_[i], bcast_[i], &batch_indices_[i]);
+ }
+ }
+}
+
+// BCast is a helper for broadcasting binary tensor operation.
+// TensorFlow's broadcasting rule follows that of numpy (See
+// http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+//
+// The rule has the following properties:
+//
+// 1. suffix matching: the rule starts with the right-most
+// dimension, and works towards the left-most dimension. Since
+// TensorFlow is row-major, the right-most dimension (the last
+// element in the shape of a tensor) is the inner-most, a.k.a.
+// the fastest changing, dimension.
+//
+// 2. Two dimensions are compatible for broadcasting if both are the
+// same or either is 1.
+//
+// BCast takes the shape of two tensors and computes a few vectors of
+// int32 that are useful for the caller to reshape the tensors, apply
+// the right broadcasts to them, compute the broadcasted operation,
+// and possibly the gradients. In a nutshell, the caller is expected
+// to compute the broadcasted operation as following:
+//
+// BCast b(x.shape(), y.shape());
+// output = x.reshape(b.x_reshape()).broadcast(b.x_bcast())
+// _op_
+// y.reshape(b.y_reshape()).broadcast(b.y_bcast())
+//
+// For the gradient computation,
+// grad_x = sum(grad * backprop_x(x, y), grad_x_reduce_idx)
+// .reshape(x.shape())
+// grad_y = sum(grad * backprop_y(x, y), grad_y_reduce_idx)
+// .reshape(y.shape())
+// backprop_x and backprop_y are functionals of the binary function "op",
+// e.g.,
+// for +, backprop_x(x, y) = backprop_y(x, y) = 1;
+// for *, backprop_x(x, y) = y, backprop_y(x, y) = x;
+// for /, backprop_x(x, y) = 1/y, backprop_y(x, y) = -x/y^2;
+//
+// The multiplication in the grad * backprop_x itself is also
+// broadcasting following the same rule.
+class BCast : public BCastList<2>
+{
+public:
+ // Constructs all helper shapes, following the aforementioned rules.
+ //
+ // If "fewer_dims_optimization" is set to true (the default), the
+ // implementation tries to reduce intermediate dimensions needed to be more
+ // efficient. This is transparent to the caller.
+ //
+ // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have
+ // the same number of dimensions as the larger of the two inputs.
+ typedef std::vector<int32_t> Vec;
+
+ BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true,
+ const bool return_flattened_batch_indices = false)
+ : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices)
+ {
+ }
+
+ ~BCast() {}
+
+ // If and only if IsValid(), the following fields can be used in
+ // implementing a broadcasted binary tensor operation according to
+ // the broadcasting rule.
+ const Vec &x_reshape() const { return reshape_[0]; }
+ const Vec &x_bcast() const { return bcast_[0]; }
+ const Vec &y_reshape() const { return reshape_[1]; }
+ const Vec &y_bcast() const { return bcast_[1]; }
+ const Vec &result_shape() const { return result_; }
+ const Vec &output_shape() const { return output_; }
+ const Vec &grad_x_reduce_idx() const { return grad_reduce_idx_[0]; }
+ const Vec &grad_y_reduce_idx() const { return grad_reduce_idx_[1]; }
+
+ // Returns the mapping from the flattened output batch indices to x's
+ // flattened batch indices. The result is a vector of length
+ // output_batch_size(). To compute the i'th batch output, a binary matmul-like
+ // operation should use the `x_batch_indices()[i]`th batch index of `x`.
+ // Note: Returns an empty vector if broadcasting is not required. Callers
+ // should only use this when IsBroadcastingRequired() returns true.
+ const std::vector<int32_t> &x_batch_indices() const { return batch_indices_[0]; }
+ // Returns the mapping from the flattened output batch indices to y's
+ // flattened batch indices. Similar to x_batch_indices().
+ // Note: Returns an empty vector if broadcasting is not required. Callers
+ // should only use this when IsBroadcastingRequired() returns true.
+ const std::vector<int32_t> &y_batch_indices() const { return batch_indices_[1]; }
+
+ template <typename IndexType, int NDIMS>
+ static Eigen::array<IndexType, NDIMS> ToIndexArrayType(const BCast::Vec &vec)
+ {
+ assert(vec.size() == NDIMS);
+ Eigen::array<IndexType, NDIMS> ret;
+ for (int i = 0; i < NDIMS; ++i)
+ ret[i] = vec[i];
+ return ret;
+ }
+
+ template <int NDIMS>
+ static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(const BCast::Vec &vec)
+ {
+ return ToIndexArrayType<Eigen::DenseIndex, NDIMS>(vec);
+ }
+
+ // Static helpers.
+ static Vec FromShape(const Shape &shape)
+ {
+ const int N = shape.DimensionsCount();
+ BCastList::Vec ret(N);
+ for (int i = 0; i < N; ++i)
+ {
+ ret[i] = shape.Dims(i);
+ }
+ return ret;
+ }
+
+ static Shape ToShape(const BCastList::Vec &vec)
+ {
+ const int N = vec.size();
+ Shape shape(N);
+
+ for (int i = 0; i < N; ++i)
+ {
+ shape.SetDim(i, vec[i]);
+ }
+ return shape;
+ }
+
+}; // BCast
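+
+// Usage sketch:
+//
+//   BCast b({2, 3}, {3});
+//   // b.IsValid()      -> true
+//   // b.output_shape() -> {2, 3}
+//   // b.y_reshape()    -> {1, 3},  b.y_bcast() -> {2, 1}
+//
+// i.e. y is viewed as a 1 x 3 tensor and replicated twice along the leading dimension
+// to match x.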
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_BCAST_H__
diff --git a/compute/cker/include/cker/operation/Helper/MatmulBCast.h b/compute/cker/include/cker/operation/Helper/MatmulBCast.h
new file mode 100644
index 000000000..b80ccc0d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/MatmulBCast.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__
+#define __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__
+
+#include <vector>
+#include <memory>
+#include <numeric>
+#include <functional>
+
+#include "BCast.h"
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Simple wrapper over BCast specialized for MatMul.
+// Provides utilities for broadcasting across batch dimensions for binary
+// MatMul-like operations.
+
+// Note: unlike BCast, this wrapper takes cker::Shape arguments directly instead of Vec.
+class MatMulBCast
+{
+public:
+ MatMulBCast(Shape &shape_x, Shape &shape_y)
+ {
+ if (shape_x.DimensionsCount() < 2 || shape_y.DimensionsCount() < 2)
+ return;
+
+ std::vector<int32_t> x;
+ std::vector<int32_t> y;
+
+ x.resize(shape_x.DimensionsCount() - 2);
+ y.resize(shape_y.DimensionsCount() - 2);
+
+ for (size_t i = 0; i < x.size(); i++)
+ {
+ x[i] = shape_x.Dims(i);
+ }
+ for (size_t i = 0; i < y.size(); i++)
+ {
+ y[i] = shape_y.Dims(i);
+ }
+
+ _batch_bcast = std::make_unique<BCast>(std::move(x), std::move(y));
+ if (!_batch_bcast->IsValid())
+ return;
+
+ auto x_reshaped = _batch_bcast->x_reshape();
+ auto y_reshaped = _batch_bcast->y_reshape();
+ auto output_shape = _batch_bcast->output_shape();
+
+ _x_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1),
+ std::multiplies<int32_t>());
+    _y_batch_size = std::accumulate(y_reshaped.cbegin(), y_reshaped.cend(), INT32_C(1),
+                                    std::multiplies<int32_t>());
+ _output_shape.ReplaceWith(output_shape.size(), output_shape.data());
+ _output_batch_size = _output_shape.FlatSize();
+ }
+
+ bool IsValid() const { return (_batch_bcast != nullptr) && _batch_bcast->IsValid(); }
+ int32_t x_batch_size() const { return _x_batch_size; }
+ int32_t y_batch_size() const { return _y_batch_size; }
+ int32_t output_batch_size() const { return _output_batch_size; }
+ const Shape &output_batch_shape() const { return _output_shape; }
+
+private:
+ std::unique_ptr<BCast> _batch_bcast;
+
+ int32_t _x_batch_size;
+ int32_t _y_batch_size;
+ Shape _output_shape;
+ int32_t _output_batch_size;
+};
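+
+// Usage sketch (assuming Shape's initializer-list constructor): for x of shape
+// [2, 1, 3, 4] and y of shape [5, 4, 6], only the leading (batch) dimensions are
+// broadcast; the trailing two matrix dimensions are ignored here.
+//
+//   Shape x_shape{2, 1, 3, 4};
+//   Shape y_shape{5, 4, 6};
+//   MatMulBCast bcast(x_shape, y_shape);
+//   // bcast.IsValid()           -> true
+//   // bcast.x_batch_size()      -> 2,  bcast.y_batch_size() -> 5
+//   // bcast.output_batch_size() -> 10  (output batch shape {2, 5})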
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__
diff --git a/compute/cker/include/cker/operation/Helper/PhiloxRandom.h b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
new file mode 100644
index 000000000..8e8879ce9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
+#define __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
+
+#include <stdlib.h>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+// Function qualifiers that need to work on both CPU and GPU.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// For nvcc.
+#define PHILOX_DEVICE_FUNC __host__ __device__
+#define PHILOX_INLINE __inline__
+#else
+// For non-nvcc.
+#define PHILOX_DEVICE_FUNC
+#define PHILOX_INLINE inline
+#endif
+#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
+
+#include <math.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace random
+{
+
+// A class that represents an inline array. It can be used on both CPU and GPU,
+// and also trivially copyable between CPU and GPU.
+// Arguments:
+// T: the array element type;
+// ElementCount: the fixed size of the array;
+template <typename T, int ElementCount> class Array
+{
+public:
+ static constexpr int kElementCount = ElementCount;
+ PHILOX_DEVICE_INLINE Array()
+ {
+ for (int i = 0; i < ElementCount; ++i)
+ {
+ data_[i] = T(0);
+ }
+ }
+
+ PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; }
+
+ PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; }
+
+ size_t size() const { return ElementCount; }
+
+private:
+ T data_[ElementCount];
+};
+
+// A class that encapsulates all the states for a random number generator using
+// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
+// in the form of four uint32.
+// There are multiple variants of this algorithm, we picked the 4x32_10 version
+// that is most suited for our applications.
+// Since this class is meant to be copied between CPU to GPU, it maintains a
+// value semantics.
+//
+// For example: To use this class and populate an array of 1024 randoms on CPU
+// with two threads,
+//
+// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
+// assert(start % 4 == 0);
+// assert(limit % 4 == 0);
+// rnd.Skip(start / 4);
+// for (int i = start; i < limit; i += 4) {
+// auto sample = rnd();
+// ... copy sample[0..3] to output[i..i+3]
+// }
+// }
+//
+// PhiloxRandom rng(seed);
+// PhiloxRandom rng_copy = rng;
+// rng.Skip(1000/4);
+//
+// ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
+// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
+// ... wait for thread 1 & 2 to finish executing Fill().
+//
+// NOTE:
+// 1. PhiloxRandom is trivially copyable.
+// 2. PhiloxRandom is compilable by gcc and nvcc.
+class PhiloxRandom
+{
+public:
+ using ResultType = Array<uint32_t, 4>;
+ using ResultElementType = uint32_t;
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = 4;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 10;
+ // The type for the 64-bit key stored in the form of two 32-bit uint
+ // that are used in the diffusion process.
+ using Key = Array<uint32_t, 2>;
+
+ PHILOX_DEVICE_INLINE
+ PhiloxRandom() {}
+
+ PHILOX_DEVICE_INLINE
+ explicit PhiloxRandom(uint64_t seed)
+ {
+ key_[0] = static_cast<uint32_t>(seed);
+ key_[1] = static_cast<uint32_t>(seed >> 32);
+ }
+
+ PHILOX_DEVICE_INLINE
+ explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi)
+ {
+ key_[0] = static_cast<uint32_t>(seed_lo);
+ key_[1] = static_cast<uint32_t>(seed_lo >> 32);
+ counter_[2] = static_cast<uint32_t>(seed_hi);
+ counter_[3] = static_cast<uint32_t>(seed_hi >> 32);
+ }
+
+ PHILOX_DEVICE_INLINE
+ PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
+
+ PHILOX_DEVICE_INLINE
+ ResultType const &counter() const { return counter_; }
+
+ PHILOX_DEVICE_INLINE
+ Key const &key() const { return key_; }
+
+ // Skip the specified number of samples of 128-bits in the current stream.
+ PHILOX_DEVICE_INLINE
+ void Skip(uint64_t count)
+ {
+ const uint32_t count_lo = static_cast<uint32_t>(count);
+ uint32_t count_hi = static_cast<uint32_t>(count >> 32);
+
+ counter_[0] += count_lo;
+ if (counter_[0] < count_lo)
+ {
+ ++count_hi;
+ }
+
+ counter_[1] += count_hi;
+ if (counter_[1] < count_hi)
+ {
+ if (++counter_[2] == 0)
+ {
+ ++counter_[3];
+ }
+ }
+ }
+
+ // Returns a group of four random numbers using the underlying Philox
+ // algorithm.
+ PHILOX_DEVICE_INLINE ResultType operator()()
+ {
+ ResultType counter = counter_;
+ Key key = key_;
+
+    // Run the single round ten times. The loop is manually unrolled
+    // for better performance.
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+
+ SkipOne();
+
+ return counter;
+ }
+
+private:
+ // We use the same constants as recommended by the original paper.
+ static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
+ static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
+ static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
+ static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
+
+ // Helper function to skip the next sample of 128-bits in the current stream.
+ PHILOX_DEVICE_INLINE void SkipOne()
+ {
+ if (++counter_[0] == 0)
+ {
+ if (++counter_[1] == 0)
+ {
+ if (++counter_[2] == 0)
+ {
+ ++counter_[3];
+ }
+ }
+ }
+ }
+
+ // Helper function to return the lower and higher 32-bits from two 32-bit
+ // integer multiplications.
+ PHILOX_DEVICE_INLINE
+ static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high)
+ {
+#ifndef __CUDA_ARCH__
+ const uint64_t product = static_cast<uint64_t>(a) * b;
+ *result_low = static_cast<uint32_t>(product);
+ *result_high = static_cast<uint32_t>(product >> 32);
+#else
+ *result_low = a * b;
+ *result_high = __umulhi(a, b);
+#endif
+ }
+
+ // Helper function for a single round of the underlying Philox algorithm.
+ PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter,
+ const Key &key)
+ {
+ uint32_t lo0;
+ uint32_t hi0;
+ MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
+
+ uint32_t lo1;
+ uint32_t hi1;
+ MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
+
+ ResultType result;
+ result[0] = hi1 ^ counter[1] ^ key[0];
+ result[1] = lo1;
+ result[2] = hi0 ^ counter[3] ^ key[1];
+ result[3] = lo0;
+ return result;
+ }
+
+ PHILOX_DEVICE_INLINE void RaiseKey(Key *key)
+ {
+ (*key)[0] += kPhiloxW32A;
+ (*key)[1] += kPhiloxW32B;
+ }
+
+private:
+ ResultType counter_;
+ Key key_;
+};
+
+} // namespace random
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
new file mode 100644
index 000000000..baeafd7c9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
+#define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
+
+#include <string.h>
+
+#include <cmath>
+
+#include <algorithm>
+#include <type_traits>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+#include "cker/operation/Helper/PhiloxRandom.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace random
+{
+
+// Helper function to convert a 16-bit integer to a half between [0..1).
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x);
+// Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
+// PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
+// Helper function to convert a 32-bit integer to a float between [0..1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x);
+// Helper function to convert two 32-bit integers to a double between [0..1).
+PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1);
+
+// Computes a + b. Requires that the result is representable in the destination
+// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
+// need *not* be representable in that type. (The condition on b excludes the
+// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
+// compute.)
+template <typename Int>
+PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b)
+{
+ // Implementation note: both b_div_2 and b - b_div_2 are positive and
+ // representable as Int.
+ auto b_div_2 = b >> 1;
+ return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
+}
+
+// A class that generates uniform distribution random numbers from the
+// underlying random integer generator.
+// Arguments:
+// Generator: a generator type that returns a number of uint32 upon each
+// invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for the
+// actual returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class Generator, typename RealType> class UniformDistribution;
+
+template <class Generator> class UniformDistribution<Generator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
+ }
+ return result;
+ }
+};
+
+template <class Generator> class UniformDistribution<Generator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint32ToFloat(sample[i]);
+ }
+ return result;
+ }
+};
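+
+// Usage sketch (illustrative; the seed is arbitrary and assumes the seed
+// constructor declared in PhiloxRandom.h):
+//   PhiloxRandom gen(/*seed=*/301);
+//   UniformDistribution<PhiloxRandom, float> dist;
+//   auto batch = dist(&gen); // Array<float, kResultElementCount>, each in [0, 1)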
+
+template <class Generator> class UniformDistribution<Generator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class UniformDistribution<Generator, int32_t>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<int32_t, kResultElementCount> ResultType;
+ typedef int32_t ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int32_t lo, int32_t hi)
+ : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = SignedAdd(lo_, sample[i] % range_);
+ }
+ return result;
+ }
+
+private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int32_t lo_;
+ uint32_t range_;
+};
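+
+// Usage sketch (illustrative): integers uniform in [lo, hi), here [1, 7).
+//   PhiloxRandom gen(/*seed=*/42);
+//   UniformDistribution<PhiloxRandom, int32_t> dist(/*lo=*/1, /*hi=*/7);
+//   auto batch = dist(&gen); // every element lies in [1, 7)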
+
+template <class Generator> class UniformDistribution<Generator, int64_t>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<int64_t, kResultElementCount> ResultType;
+ typedef int64_t ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int64_t lo, int64_t hi)
+ : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
+ result[i] = SignedAdd(lo_, bits % range_);
+ }
+ return result;
+ }
+
+private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int64_t lo_;
+ uint64_t range_;
+};
+
+// Similar to `UniformDistribution`, except that instead of generating numbers
+// in the range [low, high), it generates numbers covering the whole range of
+// the integer type.
+template <typename Generator, typename IntType> class UniformFullIntDistribution;
+
+template <typename Generator, typename IntType> class UniformFullIntDistribution32
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<IntType, kResultElementCount> ResultType;
+ typedef IntType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = sample[i];
+ }
+ return result;
+ }
+};
+
+template <typename Generator, typename IntType> class UniformFullIntDistribution64
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<IntType, kResultElementCount> ResultType;
+ typedef IntType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
+ }
+ return result;
+ }
+};
+
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int32_t>
+ : public UniformFullIntDistribution32<Generator, int32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint32_t>
+ : public UniformFullIntDistribution32<Generator, uint32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int64_t>
+ : public UniformFullIntDistribution64<Generator, int64_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint64_t>
+ : public UniformFullIntDistribution64<Generator, uint64_t>
+{
+};
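+
+// Usage sketch (illustrative): raw integers over the whole range of the type.
+//   PhiloxRandom gen(/*seed=*/1234);
+//   UniformFullIntDistribution<PhiloxRandom, uint64_t> dist;
+//   auto words = dist(&gen); // two 64-bit values per call when the generator
+//                            // natively yields four 32-bit samples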
+
+// A class that adapts a generator which natively returns multiple samples per
+// invocation so that it returns a single sample at a time.
+template <class Generator> class SingleSampleAdapter
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = 1;
+ // The number of elements that will be returned by the underlying generator.
+ static constexpr int kNativeElementCount = Generator::kResultElementCount;
+ typedef typename Generator::ResultElementType ResultType;
+ typedef typename Generator::ResultElementType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ explicit SingleSampleAdapter(Generator *gen)
+ : generator_(gen), used_result_index_(Generator::kResultElementCount)
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()()
+ {
+ if (used_result_index_ == Generator::kResultElementCount)
+ {
+ unused_results_ = (*generator_)();
+ used_result_index_ = 0;
+ }
+
+ return unused_results_[used_result_index_++];
+ }
+
+ PHILOX_DEVICE_INLINE
+ void Skip(uint64_t num_skips)
+ {
+ if (!num_skips)
+ {
+ return;
+ }
+ int num_unused_results = kNativeElementCount - used_result_index_;
+ if (num_skips <= static_cast<uint64_t>(num_unused_results))
+ {
+ used_result_index_ += num_skips;
+ return;
+ }
+ num_skips -= num_unused_results;
+ used_result_index_ = kNativeElementCount;
+ SkipFromGenerator(num_skips / kNativeElementCount);
+ num_skips = num_skips % kNativeElementCount;
+ if (num_skips)
+ {
+ unused_results_ = (*generator_)();
+ used_result_index_ = num_skips;
+ }
+ }
+
+private:
+ // This implementation iteratively skips over `num_skips` samples
+ // from `generator_`. There is an O(1) implementation for PhiloxRandom
+ // in random_distributions.cc.
+ PHILOX_DEVICE_INLINE
+ void SkipFromGenerator(uint64_t num_skips)
+ {
+ while (num_skips--)
+ {
+ (*generator_)();
+ }
+ }
+
+ Generator *generator_;
+ typename Generator::ResultType unused_results_;
+ int used_result_index_;
+};
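+
+// Usage sketch (illustrative): draw one 32-bit sample at a time from a
+// generator that natively produces a block of samples per invocation.
+//   PhiloxRandom gen(/*seed=*/7);
+//   SingleSampleAdapter<PhiloxRandom> single(&gen);
+//   uint32_t a = single(); // first element of the buffered block
+//   single.Skip(10);       // advance by ten single samples
+//   uint32_t b = single();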
+
+// A class that generates unit normal distribution random numbers from the
+// underlying random integer generator.
+// Arguments:
+// Generator: a generator type that returns a number of uint32 upon
+// each invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for actual
+// returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class Generator, typename RealType> class NormalDistribution;
+
+PHILOX_DEVICE_INLINE
+void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1);
+
+PHILOX_DEVICE_INLINE
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1);
+
+// Exactly like the float version, except that we convert to half afterwards;
+// since we don't have half-precision sin/cos even on GPUs, there's nothing to
+// gain from working in half internally.
+template <class Generator> class NormalDistribution<Generator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ float f[2];
+ BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
+ result[i] = Eigen::half(f[0]);
+ result[i + 1] = Eigen::half(f[1]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class NormalDistribution<Generator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
+ }
+ return result;
+ }
+};
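+
+// Usage sketch (illustrative): standard normal (mean 0, stddev 1) samples.
+//   PhiloxRandom gen(/*seed=*/99);
+//   NormalDistribution<PhiloxRandom, float> dist;
+//   auto z = dist(&gen); // kResultElementCount samples from N(0, 1)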
+
+template <class Generator> class NormalDistribution<Generator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ const int i2 = 2 * i;
+ BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
+ &result[i + 1]);
+ }
+ return result;
+ }
+};
+
+// A class that generates standard normal distribution samples truncated to
+// the range [-kTruncateValue, kTruncateValue].
+// Arguments:
+// Generator: a generator type that returns a number of uint32 upon
+// each invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for actual
+// returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution;
+
+// Exactly like the float version, except that we convert to half afterwards;
+// since we don't have half-precision sin/cos even on GPUs, there's nothing to
+// gain from working in half internally.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ // The threshold where the normal distribution is truncated.
+ const float kTruncateValue = 2.0f;
+
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (true)
+ {
+ // Repeatedly take samples from the normal distribution, until we have
+ // the desired number of elements that fall within the pre-defined cutoff
+ // threshold.
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ float f[2];
+ BoxMullerFloat(x0, x1, &f[0], &f[1]);
+
+ if (Eigen::numext::abs(f[0]) < kTruncateValue)
+ {
+ results[index++] = Eigen::half(f[0]);
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(f[1]) < kTruncateValue)
+ {
+ results[index++] = Eigen::half(f[1]);
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
+
+// Partial specialization for float.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ // The threshold where the normal distribution is truncated.
+ const float kTruncateValue = 2.0f;
+
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (true)
+ {
+ // Repeatedly take samples from the normal distribution, until we have
+ // the desired number of elements that fall within the pre-defined cutoff
+ // threshold.
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ float f[2];
+ BoxMullerFloat(x0, x1, &f[0], &f[1]);
+
+ if (Eigen::numext::abs(f[0]) < kTruncateValue)
+ {
+ results[index++] = f[0];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(f[1]) < kTruncateValue)
+ {
+ results[index++] = f[1];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
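+
+// Usage sketch (illustrative): this specialization pulls one 32-bit sample per
+// generator call, so it is normally driven through SingleSampleAdapter (see the
+// variable-sample FillPhiloxRandomTask in RandomOpCpu.h).
+//   PhiloxRandom gen(/*seed=*/5);
+//   SingleSampleAdapter<PhiloxRandom> single(&gen);
+//   TruncatedNormalDistribution<SingleSampleAdapter<PhiloxRandom>, float> dist;
+//   auto vals = dist(&single); // every entry satisfies |v| < kTruncateValue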
+
+// Partial specialization for double.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
+ ? SingleSampleGenerator::kNativeElementCount / 2
+ : 1;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+ const double kTruncateValue = 2.0;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (1)
+ {
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ const uint32_t x2 = (*gen)();
+ const uint32_t x3 = (*gen)();
+ double d[2];
+ BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
+
+ if (Eigen::numext::abs(d[0]) < kTruncateValue)
+ {
+ results[index++] = d[0];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(d[1]) < kTruncateValue)
+ {
+ results[index++] = d[1];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
+
+// Helper function to convert two 32-bit uniform integers to two floats
+// under the unit normal distribution.
+PHILOX_DEVICE_INLINE
+void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1)
+{
+ // This function implements the Box-Muller transform:
+ // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
+ // Do not send a really small number to log().
+ // We cannot mark "epsilon" as "static const" because NVCC would complain
+ const float epsilon = 1.0e-7f;
+ float u1 = Uint32ToFloat(x0);
+ if (u1 < epsilon)
+ {
+ u1 = epsilon;
+ }
+ const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
+ const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
+#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
+ *f0 = Eigen::numext::sin(v1);
+ *f1 = Eigen::numext::cos(v1);
+#else
+ sincosf(v1, f0, f1);
+#endif
+ *f0 *= u2;
+ *f1 *= u2;
+}
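+
+// In formula form, with u1 = Uint32ToFloat(x0) (clamped away from zero) and
+// u2 = Uint32ToFloat(x1), the transform above computes
+//   *f0 = sqrt(-2 * ln(u1)) * sin(2 * pi * u2)
+//   *f1 = sqrt(-2 * ln(u1)) * cos(2 * pi * u2)
+// which are two independent samples from the unit normal distribution.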
+
+// Helper function to convert four 32-bit uniform integers to two doubles
+// under the unit normal distribution.
+PHILOX_DEVICE_INLINE
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1)
+{
+ // This function implements the Box-Muller transform:
+ // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
+ // Do not send a really small number to log().
+ // We cannot mark "epsilon" as "static const" because NVCC would complain
+ const double epsilon = 1.0e-7;
+ double u1 = Uint64ToDouble(x0, x1);
+ if (u1 < epsilon)
+ {
+ u1 = epsilon;
+ }
+ const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
+ const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
+#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
+ *d0 = Eigen::numext::sin(v1);
+ *d1 = Eigen::numext::cos(v1);
+#else
+ sincos(v1, d0, d1);
+#endif
+ *d0 *= u2;
+ *d1 *= u2;
+}
+
+// Helper function to convert a 16-bit integer to a half between [0..1).
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x)
+{
+ // IEEE754 halfs are formatted as follows (MSB first):
+ // sign(1) exponent(5) mantissa(10)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 15 -- an excess 15 representation of a zero exponent
+ // mantissa == 10 random bits
+ const uint16_t man = x & 0x3ffu; // 10 bit mantissa
+ const uint16_t exp = static_cast<uint16_t>(15);
+ const uint16_t val = (exp << 10) | man;
+
+ Eigen::half result;
+ result.x = val;
+ return result - Eigen::half(1.0);
+}
+
+// Helper function to convert a 32-bit integer to a float between [0..1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x)
+{
+ // IEEE754 floats are formatted as follows (MSB first):
+ // sign(1) exponent(8) mantissa(23)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 127 -- an excess 127 representation of a zero exponent
+ // mantissa == 23 random bits
+ const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
+ const uint32_t exp = static_cast<uint32_t>(127);
+ const uint32_t val = (exp << 23) | man;
+
+ // Assumes that endian-ness is same for float and uint32.
+ float result;
+ memcpy(&result, &val, sizeof(val));
+ return result - 1.0f;
+}
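+
+// Worked example (illustrative): x == 0 builds the bit pattern 0x3f800000
+// (exactly 1.0f), so the function returns 0.0f; x == 0xffffffff keeps the
+// mantissa 0x7fffff, the largest float below 2.0f, so the function returns
+// 1.0f - 2^-23, the largest value this mapping can produce below 1.0f.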
+
+// Helper function to convert two 32-bit integers to a double between [0..1).
+PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
+{
+ // IEEE754 doubles are formatted as follows (MSB first):
+ // sign(1) exponent(11) mantissa(52)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 1023 -- an excess 1023 representation of a zero exponent
+ // mantissa == 52 random bits
+ const uint32_t mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
+ const uint32_t mlo = x1; // lower 32 bits of mantissa
+ const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa
+ const uint64_t exp = static_cast<uint64_t>(1023);
+ const uint64_t val = (exp << 52) | man;
+ // Assumes that endian-ness is same for double and uint64.
+ double result;
+ memcpy(&result, &val, sizeof(val));
+ return result - 1.0;
+}
+
+} // namespace random
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h
new file mode 100644
index 000000000..7dc51fe94
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOp.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/RandomDistributions.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace functor
+{
+
+template <typename Device, class Distribution> struct FillPhiloxRandom;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+// Declares the partially CPU-specialized functor struct.
+//
+// NOTE: Due to inlining done by the compiler, you may need to add
+// explicit instantiation of the functor in random_op.cc. See example
+// functor::FillPhiloxRandom<CPUDevice, random::UniformDistribution>.
+template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution>
+{
+ void operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data,
+ int64_t size, Distribution dist);
+};
+
+} // namespace functor
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
new file mode 100644
index 000000000..85d267723
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/PhiloxRandom.h"
+#include "cker/operation/Helper/RandomOp.h"
+#include "cker/operation/Helper/RandomDistributions.h"
+
+#if EIGEN_COMP_GNUC && __cplusplus > 199711L
+#define DISABLE_FLOAT_EQUALITY_WARNING \
+ _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
+#else
+#define DISABLE_FLOAT_EQUALITY_WARNING
+#define ENABLE_FLOAT_EQUALITY_WARNING
+#endif
+
+namespace nnfw
+{
+namespace cker
+{
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor
+{
+using random::PhiloxRandom;
+using random::SingleSampleAdapter;
+
+// The default implementation of the functor, which should never be invoked.
+// But we still need to provide an implementation for now for the linker to
+// work, since we do not support all the distributions yet.
+template <typename Device, class Distribution> struct FillPhiloxRandom
+{
+ typedef typename Distribution::ResultElementType T;
+ void operator()() {}
+};
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput> struct FillPhiloxRandomTask;
+
+// Specialization for distribution that takes a fixed number of samples for
+// each output.
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, false>
+{
+ typedef typename Distribution::ResultElementType T;
+ static void Run(random::PhiloxRandom gen, T *data, int64_t size, Distribution dist)
+ {
+ const int kGroupSize = Distribution::kResultElementCount;
+ gen.Skip(0);
+ int64_t offset = 0;
+
+ // First fill all the full-size groups
+ int64_t limit_group_full = size / kGroupSize;
+ for (int64_t index = 0; index < limit_group_full; ++index)
+ {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ int64_t remaining_size = size - limit_group_full * kGroupSize;
+
+ // If there are any remaining elements that need to be filled, process them
+ if (remaining_size > 0)
+ {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
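+
+// Worked example (illustrative): with size == 10 and kGroupSize == 4, the loop
+// above copies two full groups of four samples (elements 0..7), then one extra
+// draw supplies the remaining_size == 2 tail elements (8..9); the unused
+// samples of that last draw are discarded.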
+
+// Specialization for distribution that takes a variable number of samples for
+// each output. This will be slower due to the generality.
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, true>
+{
+ typedef typename Distribution::ResultElementType T;
+ static constexpr int64_t kReservedSamplesPerOutput = 256;
+
+ static void Run(random::PhiloxRandom base_gen, T *data, int64_t size, Distribution dist)
+ {
+ const int kGroupSize = Distribution::kResultElementCount;
+ static const int kGeneratorSkipPerOutputGroup =
+ kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount;
+
+ int64_t offset = 0;
+
+ // First fill all the full-size groups
+ int64_t limit_group_full = size / kGroupSize;
+ int64_t group_index;
+ for (group_index = 0; group_index < limit_group_full; ++group_index)
+ {
+ // Reset the generator to the beginning of the output group region
+ // This is necessary if we want the results to be independent of order
+ // of work
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ int64_t remaining_size = size - limit_group_full * kGroupSize;
+ // If there are any remaining elements that need to be filled, process them
+ if (remaining_size > 0)
+ {
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
+
+// Partial specialization for CPU to fill the entire region with random values.
+// It forwards to the FillPhiloxRandomTask specialization that matches the
+// distribution's sampling behavior.
+template <class Distribution>
+void FillPhiloxRandom<CPUDevice, Distribution>::
+operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data, int64_t size,
+ Distribution dist)
+{
+ FillPhiloxRandomTask<Distribution, Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+ dist);
+}
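+
+// Usage sketch (illustrative; the seed and buffer size are arbitrary, and the
+// example assumes the PhiloxRandom seed constructor and <vector>):
+//   using Dist = random::UniformDistribution<random::PhiloxRandom, float>;
+//   random::PhiloxRandom gen(/*seed=*/11);
+//   std::vector<float> out(100);
+//   FillPhiloxRandom<CPUDevice, Dist>()(gen, out.data(), out.size(), Dist());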
+
+} // namespace functor
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h
new file mode 100644
index 000000000..e6ac008a5
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/Tensor.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_TENSOR_H__
+#define __NNFW_CKER_HELPER_TENSOR_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/EigenSupport.h"
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> struct TTypes
+{
+ // Rank-<NDIMS> tensor of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ Tensor;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>,
+ Eigen::Aligned>
+ ConstTensor;
+
+ // Unaligned Rank-<NDIMS> tensor of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>>
+ UnalignedConstTensor;
+
+ typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned>
+ Tensor32Bit;
+
+ // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+ typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
+ Eigen::Aligned>
+ Scalar;
+ typedef Eigen::TensorMap<
+ Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ ConstScalar;
+
+ // Unaligned Scalar tensor of scalar type T.
+ typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
+ UnalignedScalar;
+ typedef Eigen::TensorMap<
+ Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
+ UnalignedConstScalar;
+
+ // Rank-1 tensor (vector) of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ ConstFlat;
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ ConstVec;
+
+ // Unaligned Rank-1 tensor (vector) of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>>
+ UnalignedConstFlat;
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec;
+
+ // Rank-2 tensor (matrix) of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ ConstMatrix;
+
+ // Unaligned Rank-2 tensor (matrix) of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix;
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>>
+ UnalignedConstMatrix;
+};
+
+typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
+
+template <typename T> struct InputTensor
+{
+ Shape shape;
+ const T *buffer;
+};
+
+struct Tensor
+{
+ Shape shape;
+ void *buffer;
+
+public:
+ bool copyFrom(const Tensor &other, const Shape &new_shape)
+ {
+ if (other.shape.FlatSize() != new_shape.FlatSize())
+ return false;
+
+ this->shape.ReplaceWith(new_shape.DimensionsCount(), new_shape.DimsData());
+ this->buffer = other.buffer;
+
+ return true;
+ }
+
+ template <typename T> T *base() const
+ {
+ return buffer == nullptr ? nullptr : reinterpret_cast<T *>(buffer);
+ }
+
+ template <typename T, size_t NDIMS>
+ typename TTypes<T, NDIMS>::Tensor shaped(const std::vector<int32_t> &new_sizes)
+ {
+ Eigen::array<Eigen::DenseIndex, NDIMS> dims;
+ for (size_t d = 0; d < NDIMS; d++)
+ {
+ dims[d] = new_sizes[d];
+ }
+ return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
+ }
+
+ template <typename T> typename TTypes<T>::Flat flat() { return shaped<T, 1>({shape.FlatSize()}); }
+
+ template <typename T, size_t NDIMS>
+ typename TTypes<T, NDIMS>::ConstTensor shaped(const std::vector<int32_t> &new_sizes) const
+ {
+ Eigen::array<Eigen::DenseIndex, NDIMS> dims;
+ for (size_t d = 0; d < NDIMS; d++)
+ {
+ dims[d] = new_sizes[d];
+ }
+ return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
+ }
+
+ // Create Eigen Tensor with current shape
+ template <typename T, size_t NDIMS> typename TTypes<T, NDIMS>::Tensor shaped() const
+ {
+ Eigen::array<Eigen::DenseIndex, NDIMS> dims;
+ for (size_t d = 0; d < NDIMS; d++)
+ {
+ dims[d] = shape.Dims(d);
+ }
+ return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
+ }
+
+ template <typename T> typename TTypes<T>::ConstFlat flat() const
+ {
+ return shaped<T, 1>({shape.FlatSize()});
+ }
+
+ template <typename T> typename TTypes<T>::ConstScalar scalar() const
+ {
+ return typename TTypes<T>::ConstScalar(base<T>());
+ }
+}; // Tensor
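+
+// Usage sketch (illustrative): view a raw float buffer as a 2 x 3 row-major
+// matrix through the Eigen mappings above.
+//   float buf[6] = {0, 1, 2, 3, 4, 5};
+//   const int32_t dims[2] = {2, 3};
+//   Tensor t;
+//   t.shape.ReplaceWith(2, dims);
+//   t.buffer = buf;
+//   auto m = t.shaped<float, 2>(); // TTypes<float, 2>::Tensor
+//   float v = m(1, 2);             // == 5.0f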
+
+template <typename DSizes> Eigen::DSizes<Index32, DSizes::count> To32BitDims(const DSizes &in)
+{
+ Eigen::DSizes<Index32, DSizes::count> out;
+ for (int i = 0; i < DSizes::count; ++i)
+ {
+ out[i] = in[i];
+ }
+ return out;
+}
+
+template <typename TensorType>
+typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit
+To32Bit(TensorType in)
+{
+ typedef typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit RetType;
+ return RetType(in.data(), To32BitDims(in.dimensions()));
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_TENSOR_H__
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
new file mode 100644
index 000000000..6445e8a2b
--- /dev/null
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_INSTANCE_NORM_H__
+#define __NNFW_CKER_INSTANCE_NORM_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &gamma_shape, const float *gamma_data,
+ const Shape &beta_shape, const float *beta_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
+ const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
+ const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+
+ UNUSED_RELEASE(gamma_shape);
+ UNUSED_RELEASE(beta_shape);
+ assert(output_activation_min <= output_activation_max);
+
+ for (int32_t batch = 0; batch < batches; batch++)
+ {
+ for (int32_t channel = 0; channel < channels; channel++)
+ {
+ double sum = 0.0;
+ double square_sum = 0.0;
+ int32_t size = heights * widths;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+ double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
+ sum += input_val;
+ square_sum += (input_val * input_val);
+ }
+ }
+
+ double mean = sum / size;
+ double var = square_sum / size - mean * mean;
+
+ double gamma = gamma_data[channel];
+ double beta = beta_data[channel];
+
+ double a = gamma / (std::sqrt(var + params.epsilon));
+ double b = -mean * a + beta;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+ double input_value = input_data[Offset(output_shape, batch, height, width, channel)];
+ double output_value = input_value * a + b;
+ output_data[Offset(output_shape, batch, height, width, channel)] =
+ ActivationFunctionWithMinMax((float)output_value, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
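+
+// In formula form, the per-(batch, channel) computation above is
+//   output = gamma * (input - mean) / sqrt(var + epsilon) + beta
+// which the second pass evaluates as the affine map a * input + b with
+//   a = gamma / sqrt(var + epsilon) and b = beta - mean * a,
+// so each element costs a single multiply-add after the statistics pass.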
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_INSTANCE_NORM_H__
diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h
new file mode 100644
index 000000000..a0075c3d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/L2Normalize.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_L2NORMALIZE_H__
+#define __NNFW_CKER_L2NORMALIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void L2NormalizeFloat32(const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ float epsilon = 1e-6;
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ float squared_l2_norm = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ const float val = input_data[c];
+ squared_l2_norm += val * val;
+ }
+ float l2_norm = std::sqrt(squared_l2_norm);
+ l2_norm = std::max(l2_norm, epsilon);
+ for (int c = 0; c < depth; ++c)
+ {
+ *output_data = *input_data / l2_norm;
+ ++output_data;
+ ++input_data;
+ }
+ }
+}
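+
+// Worked example (illustrative): for a single row [3.0f, 4.0f] the squared L2
+// norm is 25 and the norm is 5 (well above epsilon), so the output row is
+// [0.6f, 0.8f]; in general each output is input[c] / max(||input||_2, 1e-6).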
+
+inline void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int32_t input_zero_point = params.input_zero_point;
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ int32_t square_l2_norm = 0;
+ for (int c = 0; c < depth; c++)
+ {
+ // Note that input_data advances by depth in the second pass below.
+ int32_t diff = input_data[c] - input_zero_point;
+ square_l2_norm += diff * diff;
+ }
+ int32_t inv_l2norm_multiplier;
+ int inv_l2norm_shift;
+ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
+ for (int c = 0; c < depth; c++)
+ {
+ int32_t diff = *input_data - input_zero_point;
+ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ int32_t unclamped_output_val = 128 + rescaled_diff;
+ int32_t output_val = std::min(static_cast<int32_t>(255),
+ std::max(static_cast<int32_t>(0), unclamped_output_val));
+ *output_data = static_cast<uint8_t>(output_val);
+ ++input_data;
+ ++output_data;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_L2NORMALIZE_H__
diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h
new file mode 100644
index 000000000..27beaaead
--- /dev/null
+++ b/compute/cker/include/cker/operation/LSTM.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
+#define __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "cker/TensorUtils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// LINT.IfChange
+// Calculates a single LSTM gate.
+//
+// Implements the following formula: (* is matrix multiply)
+// gate = activate(W_input * input + W_aux * aux_input +
+// W_peephole * cell + W_recurrent * prev_output + bias)
+// with layer norm:
+// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside
+//
+// Activation is sigmoid except for the "cell" gate (configurable, usually tanh)
+//
+// Parameters:
+// Input vectors (to LSTM): | Size: | Optional?
+// input | n_input |
+// aux_input | n_aux_input | y (bidir LSTM)
+// Input vectors (persistent states):
+// output_state | n_output |
+// cell_state | n_cell |
+// 'Constant' inputs:
+// input_to_gate_weights | n_cell * n_input |
+// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM)
+// recurrent_to_gate_weights | n_cell * n_output |
+// cell_to_gate_weights | n_cell | y (peephole)
+// gate_bias | n_cell |
+// layer_norm_coefficients | n_cell | y (layer norm)
+// Output vector:
+// gate | n_cell |
+// Scalar parameters:
+// n_batch - batch size / number of vectors
+// n_input, n_aux_input, n_output, n_cell - size of vectors.
+// activation - activation to use.
+// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero.
+// use_layer_norm - if doing layer norm LSTM.
+inline void CalculateLstmGateFloat(const float *input, const float *input_to_gate_weights,
+ const float *aux_input, const float *aux_input_to_gate_weights,
+ const float *output_state,
+ const float *recurrent_to_gate_weights, const float *cell_state,
+ const float *cell_to_gate_weights,
+ const float *layer_norm_coefficients, const float *gate_bias,
+ const int n_batch, const int n_input, const int n_aux_input,
+ const int n_output, const int n_cell,
+ const FusedActivationFunctionType activation, float *gate,
+ const bool is_input_all_zeros, const bool is_aux_input_all_zeros)
+{
+ const bool use_peephole = (cell_to_gate_weights != nullptr);
+ const bool use_layer_norm = (layer_norm_coefficients != nullptr);
+
+ // Initialize scratch buffers with bias for regular lstm or initialize with
+ // zero for layer norm lstm.
+ if (use_layer_norm)
+ {
+ std::fill_n(gate, n_cell * n_batch, 0.0f);
+ }
+ else
+ {
+ VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate);
+ }
+ // For each batch and cell: compute input_weight * input.
+ // Skip if input is all zeros.
+ if (!is_input_all_zeros)
+ {
+ MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch,
+ gate, /*result_stride=*/1);
+ }
+ // For each batch and cell: compute aux_input_weight * aux_input.
+ // Skip if auxiliary input is not available or all zeros.
+ if (!is_aux_input_all_zeros)
+ {
+ MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input,
+ n_batch, gate, /*result_stride=*/1);
+ }
+ // For each batch and cell: compute recurrent_weight * output_state.
+ MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state,
+ n_batch, gate, /*result_stride=*/1);
+ // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM)
+ if (use_peephole)
+ {
+ VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch,
+ gate);
+ }
+ // Do layer normalization (if layer norm LSTM)
+ if (use_layer_norm)
+ {
+ MeanStddevNormalization(gate, gate, n_cell, n_batch);
+ VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate);
+ VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate);
+ }
+ // Apply activation
+ ApplyActivationToVector(gate, n_batch * n_cell, activation, gate);
+}
+
+// Updates the LSTM cell state, used by both float and hybrid LSTM versions.
+//
+// Implements the following formula:
+// cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate)
+//
+// With CIFG LSTM, input gate is replaced by (1-forget_gate).
+//
+// Parameters:
+// - n_batch, n_cell: sizes of vectors
+// - cell_state: input/output vector, size n_batch*n_cell
+// - input_gate: input vector, size n_batch*n_cell.
+// - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG
+// - cell_gate: input vector, size n_batch*n_cell.
+// - use_cifg: use 1-forget_gate instead of input_gate.
+// - clip: if > 0, clip the resulting cell state to [-clip, +clip].
+inline void UpdateLstmCellFloat(int n_batch, int n_cell, float *cell_state, const float *input_gate,
+ float *forget_gate, const float *cell_gate, bool use_cifg, float clip)
+{
+ // Define variable for 4th argument to avoid warning
+ // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+ const float *cwise_product_rhs = cell_state;
+ VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state);
+
+ if (use_cifg)
+ {
+ // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as
+ // scratch, as input_gate array is not allocated in this case. (Be careful
+ // not to write to the scratch before reading the forget gate data.)
+ float *scratch = forget_gate;
+ Sub1Vector(forget_gate, n_batch * n_cell, scratch);
+ VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state);
+ }
+ else
+ {
+ VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state);
+ }
+ if (clip > 0.0f)
+ {
+ CwiseClipping(cell_state, n_batch * n_cell, clip);
+ }
+}
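+
+// In formula form, the update above is
+//   cell_state = forget_gate .* cell_state + input_gate .* cell_gate
+// and, with CIFG (no separate input gate),
+//   cell_state = forget_gate .* cell_state + (1 - forget_gate) .* cell_gate,
+// optionally clipped elementwise to [-clip, +clip] when clip > 0.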
+
+// Calculates the output state tensor of an LSTM step.
+//
+// Implements the following formula:
+// output_no_projection = output_gate .* activate(cell_state)
+// (elementwise vector product)
+// If no projection is used:
+// output = output_state = output_no_projection
+// With projection:
+// output = output_state = clip(W*output_no_projection + bias)
+//
+// Output might not have a different 'stride' than n_batch, so we need to copy.
+//
+// Parameters:
+// - n_batch: batches: the number of distinct vectors in each array.
+// - n_cell, n_output: sizes of vectors.
+// - cell_state, output_gate: input vectors, size n_batch*n_cell.
+// - projection_weights, projection_weights_scale, projection_bias:
+// constant inputs, describing projection matrix and bias.
+// - proj_clip: if > 0, clip the output of the projection.
+// - output_state: output vector, size n_batch*n_output. Must be contiguous.
+// - scratch: scratch area, size n_batch*n_cell.
+inline void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float *cell_state,
+ const float *output_gate, FusedActivationFunctionType activation,
+ const float *projection_weights, const float *projection_bias,
+ const float proj_clip, float *output_state, float *scratch)
+{
+ ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch);
+
+ // Define variable for 4th argument to avoid warning
+ // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+ const float *cwise_product_rhs = scratch;
+ VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch);
+
+ const bool use_projection = (projection_weights != nullptr);
+ const bool use_projection_bias = (projection_bias != nullptr);
+
+ if (use_projection)
+ {
+ if (use_projection_bias)
+ {
+ VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state);
+ }
+ else
+ {
+ std::fill_n(output_state, n_batch * n_output, 0.0f);
+ }
+ MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch,
+ output_state, /*result_stride=*/1);
+ if (proj_clip > 0.0f)
+ {
+ CwiseClipping(output_state, n_batch * n_output, proj_clip);
+ }
+ }
+ else
+ {
+ std::copy_n(scratch, n_batch * n_output, output_state);
+ }
+}
+
+// Performs an LSTM batch inference step for input specified by input_ptr.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+// - params: various LSTM params including activation, clipping, etc.,
+// - n_batch: size of batch,
+// - n_cell: number of cells (or units),
+// - n_input: the input size,
+// - n_aux_input: the auxiliary input size.
+// - n_output: the output size.
+// - output_batch_leading_dim: the leading dimension of the output buffer.
+//
+// Input of size 'n_batch * n_input':
+// input_ptr
+// Input of size 'n_batch * n_aux_input':
+// aux_input_ptr - optional (can be nullptr)
+//
+// LSTM weights:
+// Input weights of size 'n_cell * n_input':
+// input_to_input_weights - optional
+// input_to_forget_weights
+// input_to_cell_weights
+// input_to_output_weights
+// Auxiliary input weights of size 'n_cell * n_aux_input':
+// aux_input_to_input_weights - optional
+// aux_input_to_forget_weights - optional
+// aux_input_to_cell_weights - optional
+// aux_input_to_output_weights - optional
+// Recurrent weights of size 'n_cell * n_output':
+// recurrent_to_input_weights - optional
+// recurrent_to_forget_weights
+// recurrent_to_cell_weights
+// recurrent_to_output_weights
+// Peephole weights of size 'n_cell', representing diagonal matrices.
+// cell_to_input_weights - optional
+// cell_to_forget_weights - optional
+// cell_to_output_weights - optional
+// Projection weights of size 'n_output * n_cell'
+// projection_weights_ptr - optional
+// Gate biases of size 'n_cell':
+// input_gate_bias_ptr - optional
+// forget_gate_bias_ptr
+// cell_gate_bias_ptr
+// output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+// input_layer_norm_coefficients_ptr - optional
+// forget_layer_norm_coefficients_ptr - optional
+// cell_layer_norm_coefficients_ptr - optional
+// output_layer_norm_coefficients_ptr - optional
+//
+// The pointers to the cell and output state and the output are updated.
+//
+// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned
+// in batch_major order, and each step processes batch_size many inputs from
+// input_ptr, and updates batch_size many cell and output states.
+//
+// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the
+// output tensor, and in most cases will be equal to n_output. It differs when
+// we want to store the LSTM output into a slice of the output tensor, e.g.
+// for bidirectional LSTMs with merge_outputs. In this case, the batched
+// operations cannot be used since they assume that the batched outputs are
+// contiguous, and we manually loop over the batched outputs.
+// LINT.IfChange
+inline void LstmStepFloat(
+ const float *input_ptr, const float *input_to_input_weights_ptr,
+ const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr,
+ const float *input_to_output_weights_ptr, const float *aux_input_ptr,
+ const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr,
+ const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr,
+ const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr,
+ const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr,
+ const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr,
+ const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr,
+ const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr,
+ const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr,
+ const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr,
+ const float *output_gate_bias_ptr, const float *projection_weights_ptr,
+ const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell,
+ int n_input, int n_aux_input, int n_output, int output_batch_leading_dim,
+ float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1,
+ float *scratch2, float *scratch3, float *output_ptr)
+{
+ // Since we have already checked that weights are all there or none, we can
+ // check the existence of only one to get the condition.
+ const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+
+ // Make named scratch buffers.
+ float *input_gate_scratch = scratch0;
+ float *forget_gate_scratch = scratch1;
+ float *cell_gate_scratch = scratch2;
+ float *output_gate_scratch = scratch3;
+
+ // Check if inputs are all zeros so we can skip some computations.
+ const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input);
+ const bool is_aux_input_all_zeros =
+ (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input));
+ if (!use_cifg)
+ {
+ // Calculate the input gate. (If not CIFG.)
+ CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr,
+ aux_input_to_input_weights_ptr, output_state_ptr,
+ recurrent_to_input_weights_ptr, cell_state_ptr,
+ cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr,
+ input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ }
+ // Calculate the forget gate.
+ CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr,
+ aux_input_to_forget_weights_ptr, output_state_ptr,
+ recurrent_to_forget_weights_ptr, cell_state_ptr,
+ cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr,
+ forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ // Calculate the cell update gate.
+ CalculateLstmGateFloat(
+ input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr,
+ output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr,
+ /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr,
+ n_batch, n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch,
+ is_input_all_zeros, is_aux_input_all_zeros);
+ // Update the cell state.
+ UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
+ cell_gate_scratch, use_cifg, params->cell_clip);
+ // Calculate output gate.
+ CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr,
+ aux_input_to_output_weights_ptr, output_state_ptr,
+ recurrent_to_output_weights_ptr, cell_state_ptr,
+ cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr,
+ output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ // Update the output state.
+ CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+ params->activation, projection_weights_ptr, projection_bias_ptr,
+ params->proj_clip, output_state_ptr, scratch2);
+ // Copy output state to the output. Note that the output's rows may not be
+ // contiguous (output_batch_leading_dim != n_output).
+ for (int b = 0; b < n_batch; b++)
+ {
+ std::copy_n(output_state_ptr + b * n_output, n_output,
+ output_ptr + b * output_batch_leading_dim);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h
new file mode 100644
index 000000000..326a44f0c
--- /dev/null
+++ b/compute/cker/include/cker/operation/LogSoftMax.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGSOFTMAX_H__
+#define __NNFW_CKER_LOGSOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+#include <fixedpoint/fixedpoint.h>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void LogSoftmax(const SoftmaxParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &output_shape, float *output_data)
+{
+ const int rank = input_shape.DimensionsCount();
+ const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
+ const double beta = params.beta;
+ const int depth = MatchingDim(input_shape, axis, output_shape, axis);
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < rank; ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < inner_size; ++j)
+ {
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c)
+ {
+        max = std::max(max, input_data[(i * depth + c) * inner_size + j]);
+ }
+
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c)
+ {
+ sum += std::exp((input_data[(i * depth + c) * inner_size + j] - max) * beta);
+ }
+
+ const float log_sum = std::log(sum);
+ for (int c = 0; c < depth; ++c)
+ {
+ output_data[(i * depth + c) * inner_size + j] =
+ (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
+ }
+ }
+ }
+}
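+
+// Worked example for the float path above (illustrative values): with beta = 1
+// and a single row x = {1, 2, 3}, max = 3, so the shifted inputs are {-2, -1, 0},
+// the sum of their exps is ~1.5032 and log_sum is ~0.4076. The outputs are
+// therefore approximately {-2.41, -1.41, -0.41}, i.e. the logs of the softmax
+// probabilities {0.09, 0.24, 0.67}.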
+
+inline void LogSoftmax(const SoftmaxParams &params, float input_scale, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int rank = input_shape.DimensionsCount();
+ const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
+ const double beta = params.beta;
+ const int depth = MatchingDim(input_shape, axis, output_shape, axis);
+
+ const int32_t clamp_max = std::numeric_limits<uint8_t>::max();
+ const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < rank; ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < inner_size; ++j)
+ {
+ uint8_t max_val = std::numeric_limits<uint8_t>::min();
+ for (int c = 0; c < depth; ++c)
+ {
+        max_val = std::max(max_val, input_data[(i * depth + c) * inner_size + j]);
+ }
+
+ float sum_exp = 0.0f;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const float *table_offset = &params.table[max_uint8 - max_val];
+ for (int c = 0; c < depth; ++c)
+ {
+        sum_exp += table_offset[input_data[(i * depth + c) * inner_size + j]];
+ }
+ const float log_sum_exp = std::log(sum_exp);
+
+ const float scale = input_scale / params.scale;
+ const float precomputed = (input_scale * max_val * beta + log_sum_exp) / params.scale;
+ for (int c = 0; c < depth; ++c)
+ {
+        const float log_prob =
+          scale * input_data[(i * depth + c) * inner_size + j] * beta - precomputed;
+ const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
+        output_data[(i * depth + c) * inner_size + j] =
+          static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGSOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/LogicalNot.h b/compute/cker/include/cker/operation/LogicalNot.h
new file mode 100644
index 000000000..5e8d38b45
--- /dev/null
+++ b/compute/cker/include/cker/operation/LogicalNot.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGICALNOT_H__
+#define __NNFW_CKER_LOGICALNOT_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void LogicalNot(const Shape &input_shape, const bool *input_data, const Shape &output_shape,
+ bool *output_data)
+{
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = !input_data[i];
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGICALNOT_H__
diff --git a/compute/cker/include/cker/operation/LogicalOr.h b/compute/cker/include/cker/operation/LogicalOr.h
new file mode 100644
index 000000000..ec07c23d9
--- /dev/null
+++ b/compute/cker/include/cker/operation/LogicalOr.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGICAL_OR_H__
+#define __NNFW_CKER_LOGICAL_OR_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void LogicalOrBroadcast(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = in1_val || in2_val;
+ }
+ }
+ }
+ }
+}
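+
+// Example for LogicalOrBroadcast above (illustrative shapes): broadcasting
+// input1 of shape [2, 1, 1, 1] against input2 of shape [1, 1, 1, 3] produces an
+// output of shape [2, 1, 1, 3] where
+//   output[b, 0, 0, c] = input1[b, 0, 0, 0] || input2[0, 0, 0, c].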
+
+template <typename T>
+inline void LogicalOrElementwise(const Shape &shape, const T *input1_data, const T *input2_data,
+ T *output_data)
+{
+
+ int num_elements = shape.FlatSize();
+
+ for (int t = 0; t < num_elements; t++)
+ {
+ output_data[t] = input1_data[t] || input2_data[t];
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGICAL_OR_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..3d3e59e55
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <cmath>
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ auto input_map = MapAsVector(input_data, input_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h
new file mode 100644
index 000000000..5674ff3ef
--- /dev/null
+++ b/compute/cker/include/cker/operation/MatrixBandPart.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MATRIX_BAND_PART_H__
+#define __NNFW_CKER_MATRIX_BAND_PART_H__
+
+#include "cker/Shape.h"
+
+#include <algorithm>
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shape &input_shape,
+ const float *input_data, const Shape &output_shape, float *output_data)
+{
+ auto last_dim = input_shape.DimensionsCount() - 1;
+
+ T batch_num = 1;
+ for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++)
+ {
+ batch_num *= input_shape.Dims(dim);
+ }
+
+ const T row_num = input_shape.Dims(last_dim - 1);
+ const T col_num = input_shape.Dims(last_dim);
+
+ if (!(num_lower_diags <= row_num))
+ throw std::runtime_error(
+ "MatrixBandPart : num_lower must be negative or less or equal to number of rows");
+
+ if (!(num_upper_diags <= col_num))
+ throw std::runtime_error(
+ "MatrixBandPart : num_upper must be negative or less or equal to number of columns");
+
+ std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init
+
+ // reference code, without multithreading
+ for (T batch = 0; batch < batch_num; ++batch)
+ {
+ for (T row = 0; row < row_num; ++row)
+ {
+ auto output = output_data + (batch * row_num * col_num + row * col_num);
+ auto input = input_data + (batch * row_num * col_num + row * col_num);
+
+ const T band_start =
+ num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
+ const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num),
+ row + num_upper_diags + 1);
+
+ for (T band_idx = band_start; band_idx < band_end; band_idx++)
+ {
+ output[band_idx] = input[band_idx];
+ }
+ }
+ }
+}
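+
+// Example for MatrixBandPart above (illustrative): with num_lower_diags = 1 and
+// num_upper_diags = 0, a 3x3 matrix of ones becomes
+//   1 0 0
+//   1 1 0
+//   0 1 1
+// i.e. only the main diagonal and the first sub-diagonal are copied; every other
+// element keeps the zero fill value.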
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MATRIX_BAND_PART_H__
diff --git a/compute/cker/include/cker/operation/MaxMin.h b/compute/cker/include/cker/operation/MaxMin.h
new file mode 100644
index 000000000..691b3b0b3
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxMin.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAXMIN_H__
+#define __NNFW_CKER_MAXMIN_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct MaximumOp
+{
+ template <typename data_type> static data_type op(data_type el1, data_type el2)
+ {
+ return el1 > el2 ? el1 : el2;
+ }
+};
+
+struct MinimumOp
+{
+ template <typename data_type> static data_type op(data_type el1, data_type el2)
+ {
+ return el1 < el2 ? el1 : el2;
+ }
+};
+
+template <typename T, typename Op>
+inline void
+MaximumMinimumBroadcast4DSlow(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data, Op op)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = op(in1_val, in2_val);
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Max(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
+ input2_data, unextended_output_shape, output_data,
+ MaximumOp::template op<T>);
+}
+
+template <typename T>
+inline void Min(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
+ input2_data, unextended_output_shape, output_data,
+ MinimumOp::template op<T>);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAXMIN_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..ea3fcaca6
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T> void MaxPool(const PoolParams &, const Shape &, const T *, const Shape &, T *)
+{
+ static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
+ "cker::MaxPool : This function supports only integer or floating point");
+ throw std::runtime_error("cker::MaxPool : Unsupported data type");
+}
+
+template <>
+void MaxPool<float>(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // Prefill the output to minimum representable float value
+ out_mat.setConstant(std::numeric_limits<float>::lowest());
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
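+        // For example (illustrative): with filter_height = 3, stride_height = 2
+        // and no padding, input row h = 4 contributes to output rows 1 and 2
+        // (h_start = (4 - 3) / 2 + 1 = 1, h_end = 4 / 2 + 1 = 3, assuming the
+        // output is tall enough), which are exactly the output rows whose 3-row
+        // windows contain row 4.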
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset)
+ .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+ }
+ }
+ }
+ }
+ }
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+template <>
+void MaxPool<uint8_t>(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+ // use an inner loop down the depth. Since depths can be large and hence we
+ // would need arbitrarily large temporary storage, we divide the work up into
+ // depth tranches just within the batch loop.
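+  // For example (illustrative): with depth = 600, the inner loops below process
+  // tranches of 256, 256 and 88 channels in turn.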
+ static constexpr int kPoolingAccTrancheSize = 256;
+
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ uint8_t acc[kPoolingAccTrancheSize];
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ // We proceed through the depth in tranches (see comment above). The
+ // depth_base is the depth at the beginning of the tranche. The
+ // tranche_depth is the depth dimension of the tranche.
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
+ {
+ const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ memset(acc, 0, tranche_depth * sizeof(acc[0]));
+ const uint8_t *input_ptr =
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ for (int fy = filter_y_start; fy < filter_y_end; fy++)
+ {
+ const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
+ for (int fx = filter_x_start; fx < filter_x_end; fx++)
+ {
+ const uint8_t *input_channel_ptr = input_row_ptr;
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ uint8x16_t acc_reg = vld1q_u8(acc + channel);
+ uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+ input_channel_ptr += 16;
+ acc_reg = vmaxq_u8(acc_reg, input_reg);
+ vst1q_u8(acc + channel, acc_reg);
+ }
+
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint8x8_t acc_reg = vld1_u8(acc + channel);
+ uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+ input_channel_ptr += 8;
+ acc_reg = vmax_u8(acc_reg, input_reg);
+ vst1_u8(acc + channel, acc_reg);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ acc[channel] = std::max(acc[channel], *input_channel_ptr++);
+ }
+ input_row_ptr += depth;
+ }
+ }
+ uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ uint8x16_t a = vld1q_u8(acc + channel);
+ a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
+ a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
+ vst1q_u8(output_ptr + channel, a);
+ }
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ uint8x8_t a = vld1_u8(acc + channel);
+ a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
+ a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
+ vst1_u8(output_ptr + channel, a);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ uint8_t a = acc[channel];
+ a = std::max<uint8_t>(a, params.quantized_activation_min);
+ a = std::min<uint8_t>(a, params.quantized_activation_max);
+ output_ptr[channel] = static_cast<uint8_t>(a);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h
new file mode 100644
index 000000000..c0dbc6df5
--- /dev/null
+++ b/compute/cker/include/cker/operation/OneHot.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ONEHOT_H__
+#define __NNFW_CKER_ONEHOT_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T, typename TI>
+void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t axis,
+ const Shape &indices_shape, const TI *indices_data, const Shape &, T *output_data)
+{
+ if (axis == -1)
+ axis = indices_shape.DimensionsCount();
+
+ // prefix_dim_size == # of elements before the axis
+ // depth == # of elements per axis
+ // suffix_dim_size == # of elements after the axis
+ int prefix_dim_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ prefix_dim_size *= indices_shape.Dims(i);
+ }
+ const int suffix_dim_size = indices_shape.FlatSize() / prefix_dim_size;
+
+ // View the indices as a matrix of size:
+ // prefix_dim_size x suffix_dim_size
+ // View the output as a matrix of size:
+ // prefix_dim_size x depth x suffix_dim_size
+ // Then the output is:
+ // output(i, j, k) == (indices(i, k) == j) ? on : off
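+  // Example (illustrative): depth = 3, on_value = 1, off_value = 0, axis = -1
+  // and indices = {0, 2} (shape [2]) produce the output
+  //   {{1, 0, 0},
+  //    {0, 0, 1}}.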
+ for (int i = 0; i < prefix_dim_size; ++i)
+ {
+ for (int j = 0; j < depth; ++j)
+ {
+ for (int k = 0; k < suffix_dim_size; ++k, ++output_data)
+ {
+ *output_data =
+ static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ONEHOT_H__
diff --git a/compute/cker/include/cker/operation/Pack.h b/compute/cker/include/cker/operation/Pack.h
new file mode 100644
index 000000000..fd865047d
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pack.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PACK_H__
+#define __NNFW_CKER_PACK_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+inline void Pack(const PackParams &params, const Scalar *const *input_data,
+ const Shape &output_shape, Scalar *output_data)
+{
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = params.axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ const Scalar *input_ptr = input_data[i] + copy_size * k;
+ int loc = k * inputs_count * copy_size + i * copy_size;
+ memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+}
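+
+// Example for Pack above (illustrative): packing two inputs of shape [2, 3]
+// along axis = 1 produces an output of shape [2, 2, 3] with
+// output[b][i][:] = row b of input i. Here outer_size = 2 and copy_size = 3, so
+// the rows of the two inputs are interleaved batch by batch.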
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PACK_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..4a2732d82
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <algorithm>
+#include <cstring>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+#include <iostream>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+ const T *input_data, const Shape &output_shape, T *output_data,
+ const T *constant_value_data)
+{
+ // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+ // TODO: come up with more subtle solution that uses subtensors like arm compute
+ // TODO: Check if it works for all layouts
+
+ using PaddingInfo = std::pair<int32_t, int32_t>;
+ /** List of padding information */
+ using PaddingList = std::vector<PaddingInfo>;
+
+ const T constant_value = constant_value_data ? *constant_value_data : 0;
+ assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+ PaddingList padding_list(pad_rank);
+ for (int32_t n = 0; n < pad_rank; ++n)
+ {
+ const int32_t *from = padding_data + (n * 2);
+ padding_list[n] = {from[0], from[1]};
+ }
+ for (int32_t i = 0; i < pad_rank; ++i)
+ {
+ assert(output_shape.Dims(i) ==
+ input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+ }
+ /* Use pad_rank since given input/output shapes are expanded to 4d before calling all cker
+ functions:
+ 1. to prevent access violation in padding_list;
+ 2. handling as 4d is slower than as 2d/3d.
+ */
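+  // Example (illustrative): padding_data is laid out as
+  // {before_0, after_0, before_1, after_1, ...}. For pad_rank = 2 and
+  // padding_data = {1, 1, 2, 2}, one padded row is added above and below and two
+  // padded columns on each side, so an input of shape [2, 3] produces an output
+  // of shape [4, 7] filled with constant_value outside the copied region.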
+ switch (pad_rank)
+ {
+ case 0:
+ case 1:
+ {
+ const int32_t in_row_len = input_shape.Dims(0);
+ std::fill_n(output_data, padding_list[0].first, constant_value);
+ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
+ std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+ constant_value);
+ break;
+ }
+ case 2: // HW
+ {
+ const int32_t in_row_len = input_shape.Dims(1);
+ const int32_t out_row_size = output_shape.Dims(1);
+
+ // prepend padding rows
+ std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
+ {
+ auto out_offset = i * out_row_size;
+ const auto in_offset = j * in_row_len;
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
+
+ out_offset += padding_list[1].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
+ constant_value);
+ break;
+ }
+ case 3: // HWC
+ {
+ const int32_t in_row_len = input_shape.Dims(2);
+ const int32_t out_row_size = output_shape.Dims(2);
+ const auto plain_size = out_row_size * output_shape.Dims(1);
+
+ // prepend padding plains
+ std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
+ {
+ auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
+ const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
+
+ out_offset += padding_list[2].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[1].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
+ constant_value);
+ break;
+ }
+ case 4:
+ {
+ auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
+ return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
+ };
+ const int32_t in_row_len = input_shape.Dims(3);
+ const int32_t out_row_size = output_shape.Dims(3);
+ const auto plain_size = out_row_size * output_shape.Dims(2);
+ const auto parallelepiped_size = plain_size * output_shape.Dims(1);
+
+ // prepend padding parallelepipeds
+ std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
+
+ const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_h_offset = get_offset(output_shape, i, 0, 0);
+ // prepend padding plains
+ std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
+ {
+ const auto out_w_offset = get_offset(output_shape, i, j, 0);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
+ for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
+ {
+ auto out_c_offset = get_offset(output_shape, i, j, k);
+ const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
+
+ // prepend padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
+
+ out_c_offset += padding_list[3].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+ out_c_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[2].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
+ padding_list[1].second * plain_size, constant_value);
+ }
+ // append padding parallelepipeds
+ std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
+ padding_list[0].second * parallelepiped_size, constant_value);
+ break;
+ }
+ default:
+ throw std::runtime_error("Padding for rank > 4 NYI");
+ break;
+ }
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PAD_H__
diff --git a/compute/cker/include/cker/operation/Pow.h b/compute/cker/include/cker/operation/Pow.h
new file mode 100644
index 000000000..1214e0964
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pow.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_POW_H__
+#define __NNFW_CKER_POW_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void powImpl(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = std::pow(input1_data[i], input2_data[i]);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_POW_H__
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
new file mode 100644
index 000000000..5c82d111f
--- /dev/null
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_QUANTIZE_H__
+#define __NNFW_CKER_QUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+#include <iostream>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename InputT, typename OutputT>
+inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape,
+ OutputT *output_data, const float output_scale, const int32_t output_offset)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  const int32_t min_val = std::numeric_limits<OutputT>::min();
+  const int32_t max_val = std::numeric_limits<OutputT>::max();
+
+ for (int i = 0; i < flat_size; i++)
+ {
+    int32_t unclamped =
+      static_cast<int32_t>(std::round(input_data[i] / output_scale)) + output_offset;
+ int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = clamped;
+ }
+}
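+
+// Example for Quantize above (illustrative): with OutputT = uint8_t,
+// output_scale = 0.5 and output_offset = 10, an input value of 3.2f maps to
+// round(3.2 / 0.5) + 10 = 6 + 10 = 16; results are clamped to [0, 255].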
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_QUANTIZE_H__
diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h
new file mode 100644
index 000000000..5c3a773a2
--- /dev/null
+++ b/compute/cker/include/cker/operation/Range.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RANGE_H__
+#define __NNFW_CKER_RANGE_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T> inline int GetSize(T start, T limit, T delta)
+{
+ if (!((start > limit && delta < 0) || (start < limit && delta > 0)))
+ {
+ throw std::runtime_error("Range: invalid input values");
+ }
+
+ int size = (std::is_integral<T>::value
+ ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
+ : std::ceil(std::abs((limit - start) / delta)));
+ return size;
+}
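+
+// For example (illustrative): GetSize(0, 10, 3) returns 4 for both integral and
+// floating-point types ((|10 - 0| + 3 - 1) / 3 = 4, ceil(10 / 3) = 4), so Range
+// fills the output with {0, 3, 6, 9}.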
+
+template <typename T>
+inline void Range(const T *start_data, const T *limit_data, const T *delta_data, T *output_data)
+{
+ const T start_value = *start_data;
+ const T delta_value = *delta_data;
+ const T limit_value = *limit_data;
+
+ const int num_elements = GetSize<T>(start_value, limit_value, delta_value);
+ T value = start_value;
+
+ for (int i = 0; i < num_elements; ++i)
+ {
+ output_data[i] = value;
+ value += delta_value;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RANGE_H__
diff --git a/compute/cker/include/cker/operation/ReLU.h b/compute/cker/include/cker/operation/ReLU.h
new file mode 100644
index 000000000..2a6cc4a98
--- /dev/null
+++ b/compute/cker/include/cker/operation/ReLU.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RELU_H__
+#define __NNFW_CKER_RELU_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <cmath>
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ReLU(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const auto input_map = MapAsVector(input_data, input_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ output_map = input_map.cwiseMax(0.0f);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RELU_H__
diff --git a/compute/cker/include/cker/operation/ReLU6.h b/compute/cker/include/cker/operation/ReLU6.h
new file mode 100644
index 000000000..20df561dc
--- /dev/null
+++ b/compute/cker/include/cker/operation/ReLU6.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RELU6_H__
+#define __NNFW_CKER_RELU6_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <cmath>
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ReLU6(const Shape &input_shape, const float *input_data, float *output_data)
+{
+ int size = input_shape.FlatSize();
+
+ for (int i = 0; i < size; ++i)
+ {
+ if (input_data[i] <= 0)
+ {
+ output_data[i] = 0;
+ }
+ else if (input_data[i] > 6.0)
+ {
+ output_data[i] = 6.0;
+ }
+ else
+ {
+ output_data[i] = input_data[i];
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RELU6_H__
diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h
new file mode 100644
index 000000000..2b2e8d338
--- /dev/null
+++ b/compute/cker/include/cker/operation/Reduce.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REDUCE_H__
+#define __NNFW_CKER_REDUCE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
+// This method iterates through the input data and reduces elements along the
+// dimensions given in axis.
+
+#ifdef USE_NEON
+inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
+ float *output_data)
+{
+ const auto input_dims = input_shape.DimsData();
+ const auto input_num_dims = input_shape.DimensionsCount();
+
+ int input_size = 1;
+ int reduce_size = 0;
+ for (int idx = 0; idx < input_num_dims - 1; idx++)
+ {
+ input_size *= input_dims[idx];
+ }
+ reduce_size = input_dims[input_num_dims - 1];
+ for (int idx = 0; idx < input_size; idx++)
+ {
+ int r_idx = 0;
+ float tmp_data[4] = {
+ 0,
+ };
+ float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
+ for (; r_idx <= reduce_size - 32; r_idx += 32)
+ {
+ float32x4_t a10 = vld1q_f32(input_data + r_idx);
+ float32x4_t a11 = vld1q_f32(input_data + r_idx + 4);
+ float32x4_t a12 = vld1q_f32(input_data + r_idx + 8);
+ float32x4_t a13 = vld1q_f32(input_data + r_idx + 12);
+ float32x4_t a20 = vld1q_f32(input_data + r_idx + 16);
+ float32x4_t a21 = vld1q_f32(input_data + r_idx + 20);
+ float32x4_t a22 = vld1q_f32(input_data + r_idx + 24);
+ float32x4_t a23 = vld1q_f32(input_data + r_idx + 28);
+
+ float32x4_t x0 = vaddq_f32(a10, a20);
+ float32x4_t x1 = vaddq_f32(a11, a21);
+ float32x4_t x2 = vaddq_f32(a12, a22);
+ float32x4_t x3 = vaddq_f32(a13, a23);
+
+ float32x4_t y0 = vaddq_f32(x0, x1);
+ float32x4_t y1 = vaddq_f32(x2, x3);
+ float32x4_t y2 = vaddq_f32(y0, y1);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
+ }
+ for (; r_idx <= reduce_size - 8; r_idx += 8)
+ {
+ float32x4_t a1 = vld1q_f32(input_data + r_idx);
+ float32x4_t a2 = vld1q_f32(input_data + r_idx + 4);
+ float32x4_t x = vaddq_f32(a1, a2);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
+ }
+ vst1q_f32(tmp_data, tmp_data_32x4);
+ output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
+
+ for (; r_idx < reduce_size; r_idx++)
+ {
+ if (r_idx == 0)
+ {
+ output_data[idx] = input_data[idx * reduce_size];
+ }
+ else
+ {
+ output_data[idx] += input_data[idx * reduce_size + r_idx];
+ }
+ }
+ }
+}
+#endif // NEON
+
+template <typename In, typename Out>
+inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
+ const int *axis, const int num_axis, int *input_iter,
+ Out reducer(const Out current, const In in), Out *output_data)
+{
+ const auto input_dims = input_shape.DimsData();
+ const auto input_num_dims = input_shape.DimensionsCount();
+
+  // Fast path: reduce only along the innermost dimension.
+ if (num_axis == 1 && axis[0] == input_num_dims - 1)
+ {
+ int input_size = 1;
+ int reduce_size = 0;
+ for (int idx = 0; idx < input_num_dims - 1; idx++)
+ {
+ input_size *= input_dims[idx];
+ }
+ reduce_size = input_dims[input_num_dims - 1];
+ for (int idx = 0; idx < input_size; idx++)
+ {
+ for (int r_idx = 0; r_idx < reduce_size; r_idx++)
+ {
+ if (r_idx == 0)
+ {
+ output_data[idx] = input_data[idx * reduce_size];
+ }
+ else
+ {
+ output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
+ }
+ }
+ }
+ return true;
+ }
+
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ input_iter[idx] = 0;
+ }
+ // Iterate through input_data.
+ do
+ {
+ size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+ size_t output_offset =
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
+ } while (NextIndex(input_num_dims, input_dims, input_iter));
+ return true;
+}
+
+// This method parses the input 'axis' to remove duplicates and handle negative
+// values, and returns a valid 'out_axis'
+inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
+ int *out_num_axis)
+{
+ auto num_axis = axes.size();
+ auto axis = axes.data();
+
+ *out_num_axis = 0; // Just in case.
+ // Short-circuit axis resolution for scalars; the axis will go unused.
+ if (num_dims == 0)
+ {
+ return true;
+ }
+  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4
+ for (size_t idx = 0; idx < num_axis; ++idx)
+ {
+ // Handle negative index. A positive index 'p_idx' can be represented as a
+ // negative index 'n_idx' as: n_idx = p_idx-num_dims
+    // e.g. for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1].
+ int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
+ assert(current >= 0 && current < num_dims);
+ bool is_dup = false;
+ for (int j = 0; j < *out_num_axis; ++j)
+ {
+ if (out_axis[j] == current)
+ {
+ is_dup = true;
+ break;
+ }
+ }
+ if (!is_dup)
+ {
+ out_axis[*out_num_axis] = current;
+ *out_num_axis += 1;
+ }
+ }
+ return true;
+}
+
+template <typename T>
+inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
+{
+ const auto dims = shape.DimsData();
+ const auto num_dims = shape.DimensionsCount();
+ size_t num_elements = 1;
+ for (int idx = 0; idx < num_dims; ++idx)
+ {
+ size_t current = static_cast<size_t>(dims[idx]);
+ // Overflow prevention.
+ if (num_elements > std::numeric_limits<size_t>::max() / current)
+ {
+ return false;
+ }
+ num_elements *= current;
+ }
+ for (size_t idx = 0; idx < num_elements; ++idx)
+ {
+ data[idx] = init_value;
+ }
+ return true;
+}
+
+class Reduce
+{
+public:
+ Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}
+
+ void prepare(size_t temp_index_size, size_t resolved_axis_size)
+ {
+ if (_prepared)
+ return;
+
+ // prepare space for temp_index and resolved_axis
+ if (temp_index_size > kMaxSmallSize)
+ _temp_index.resize(temp_index_size);
+ if (resolved_axis_size > kMaxSmallSize)
+ _resolved_axis.resize(resolved_axis_size);
+ _prepared = true;
+ }
+
+ // Computes the generic value (i.e., sum/max/min/prod) of elements across
+ // dimensions given in axis. It needs to pass in init_value and reducer.
+ template <typename T>
+ inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
+ const Shape &output_shape, T *output_data, const std::vector<int> &axes,
+ bool, T init_value, T reducer(const T current, const T in))
+ {
+ // Reset output data.
+ if (!InitTensorDataForReduce(output_shape, init_value, output_data))
+ {
+ return false;
+ }
+
+ // Resolve axis.
+ int num_resolved_axis = 0;
+ if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
+ {
+ return false;
+ }
+
+ return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
+ num_resolved_axis, temp_index_data(), reducer, output_data);
+ }
+
+ // Computes the mean of elements across dimensions given in axis.
+  // It does so in two stages: it first calculates the sum of elements along the
+  // axis and then divides that sum by the number of elements reduced along the
+  // axis, handling quantized (zero point / scale) values.
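+  // Conceptually (illustrative, ignoring rounding details), the mean case below
+  // computes
+  //   output ~= ((temp_sum / N) - input_zero_point) * input_scale / output_scale
+  //             + output_zero_point
+  // where N is the number of elements reduced per output, clamped to the output
+  // type's range; the sum case rescales the whole sum (subtracting
+  // N * input_zero_point) without dividing by N.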
+ template <typename T, typename U>
+ inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
+ const Shape &input_shape, T *output_data,
+ int32_t output_zero_point, float output_scale,
+ const Shape &output_shape, const std::vector<int> &axes,
+ bool /*keep_dims*/, U *temp_sum, bool compute_sum,
+ U reducer(const U current, const T in))
+ {
+ // Reset output data.
+ size_t num_outputs = 1;
+ for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
+ {
+ size_t current = static_cast<size_t>(output_shape.Dims(idx));
+ // Overflow prevention.
+ if (num_outputs > std::numeric_limits<size_t>::max() / current)
+ {
+ return false;
+ }
+ num_outputs *= current;
+ }
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ output_data[idx] = T();
+ temp_sum[idx] = U();
+ }
+
+ // Resolve axis.
+ int num_resolved_axis = 0;
+ if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
+ {
+ return false;
+ }
+
+ if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
+ num_resolved_axis, temp_index_data(), reducer, temp_sum))
+ {
+ return false;
+ }
+
+ // Calculate mean by dividing output_data by num of aggregated element.
+ U num_elements_in_axis = 1;
+ for (int idx = 0; idx < num_resolved_axis; ++idx)
+ {
+ size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
+ // Overflow prevention.
+ if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
+ {
+ return false;
+ }
+ num_elements_in_axis *= current;
+ }
+
+ if (num_elements_in_axis > 0)
+ {
+ const float scale = input_scale / output_scale;
+ if (compute_sum)
+ {
+ // TODO(b/116341117): Eliminate float and do this completely in 8bit.
+ const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ const U value =
+ static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
+ output_data[idx] = static_cast<T>(value);
+ }
+ }
+ else
+ {
+ const float bias = -input_zero_point * scale + 0.5f;
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ float float_mean =
+ static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
+ float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
+ static_cast<float>(std::numeric_limits<T>::max()));
+ result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
+ output_data[idx] = static_cast<T>(result);
+ }
+ }
+ }
+ return true;
+ }
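+
+  // A minimal usage sketch for the quantized path (illustrative only; `reduce` is a
+  // prepared Reduce instance and the `in_*`/`out_*` names are caller-supplied
+  // placeholders). One int32_t accumulator per output element is required:
+  //   std::vector<int32_t> temp_sum(out_shape.FlatSize());
+  //   reduce.QuantizedMeanOrSum<uint8_t, int32_t>(
+  //     in_data, in_zero_point, in_scale, in_shape, out_data, out_zero_point, out_scale,
+  //     out_shape, axes, /*keep_dims=*/false, temp_sum.data(), /*compute_sum=*/false,
+  //     [](const int32_t sum, const uint8_t in) { return sum + static_cast<int32_t>(in); });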
+
+ inline int32_t *resolved_axis_data(void)
+ {
+ return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
+ }
+ inline int32_t *temp_index_data(void)
+ {
+ return _temp_index.size() ? _temp_index.data() : _temp_index_small;
+ }
+
+private:
+ std::vector<int> _temp_index;
+ std::vector<int> _resolved_axis;
+ bool _prepared;
+ static constexpr int kMaxSmallSize = 4;
+ int _temp_index_small[kMaxSmallSize];
+ int _resolved_axis_small[kMaxSmallSize];
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REDUCE_H__
diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h
new file mode 100644
index 000000000..2e4fc6274
--- /dev/null
+++ b/compute/cker/include/cker/operation/ReduceMean.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REDUCEMEAN_H__
+#define __NNFW_CKER_REDUCEMEAN_H__
+
+#include "cker/Shape.h"
+#include "cker/operation/Reduce.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline float round_nearest(float value)
+{
+ if (value < 0)
+ {
+ return static_cast<float>(static_cast<int>(value - 0.5f));
+ }
+ else
+ {
+ return static_cast<float>(static_cast<int>(value + 0.5f));
+ }
+}
+template <typename Out, typename In>
+Out mean_reducer(const Out data1, const In data2, int normalizer)
+{
+ return data1 + static_cast<Out>(data2) / normalizer;
+}
+
+template <typename In> int sum_reducer(const int data1, const In data2)
+{
+ return data1 + static_cast<int>(data2);
+}
+
+template <typename In, typename Out>
+inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const int *axis,
+ const int num_axis, int *input_iter,
+ Out reducer(const Out current, const In in, int normalizer),
+ Out *output_data)
+{
+ const auto input_dims = input_shape.DimsData();
+ const auto input_num_dims = input_shape.DimensionsCount();
+ int normalizer = 1;
+ // Reset input iterator.
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ input_iter[idx] = 0;
+ }
+  // Compute the normalizer (number of input elements reduced into each output element)
+ for (int idx = 0; idx < num_axis; ++idx)
+ {
+ normalizer *= input_dims[axis[idx]];
+ }
+ // Iterate through input_data.
+ do
+ {
+ size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+ size_t output_offset =
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ output_data[output_offset] =
+ reducer(output_data[output_offset], input_data[input_offset], normalizer);
+ } while (NextIndex(input_num_dims, input_dims, input_iter));
+ return true;
+}
+
+template <typename In>
+inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, const int *axis,
+ const int num_axis, int *input_iter,
+ int reducer(const int current, const In in), int *temp_sum)
+{
+ const auto input_dims = input_shape.DimsData();
+ const auto input_num_dims = input_shape.DimensionsCount();
+ size_t normalizer = 1;
+ // Reset input iterator.
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ input_iter[idx] = 0;
+ }
+  // Compute the normalizer (number of input elements reduced into each output element)
+ for (int idx = 0; idx < num_axis; ++idx)
+ {
+ normalizer *= input_dims[axis[idx]];
+ }
+ // Iterate through input_data.
+ do
+ {
+ size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+ size_t output_offset =
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]);
+ } while (NextIndex(input_num_dims, input_dims, input_iter));
+ return normalizer;
+}
+
+class ReduceMean : public Reduce
+{
+public:
+  ReduceMean() : Reduce() {}
+
+ template <typename T>
+ int PrepareforReduce(const Shape &input_shape, const Shape &output_shape,
+ const std::vector<int> &axes, T *output_data, T init_value)
+ {
+ // Reset output data.
+ if (!InitTensorDataForReduce(output_shape, init_value, output_data))
+ {
+ return -1;
+ }
+ const auto input_dims = input_shape.DimsData();
+ const int num_dims = input_shape.DimensionsCount();
+ int resolved_axis_size = 1;
+ const auto num_axes = axes.size();
+
+ for (size_t idx = 0; idx < num_axes; idx++)
+ {
+ int current = axes[idx] < 0 ? (axes[idx] + num_dims) : axes[idx];
+ assert(current >= 0 && current < num_dims);
+ resolved_axis_size *= input_dims[current];
+ }
+
+ prepare(num_dims, resolved_axis_size);
+
+ // Resolve axis.
+ int num_resolved_axis = 0;
+ if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
+ {
+ return -1;
+ }
+
+ return num_resolved_axis;
+ }
+
+  // Computes a generic reduction (e.g., the mean) of elements across the dimensions
+  // given in axis. The caller must pass in init_value and a reducer.
+ template <typename In, typename Out>
+ inline bool ReduceOp(const Shape &input_shape, const In *input_data, const Shape &output_shape,
+ Out *output_data, const std::vector<int> &axes, bool, Out init_value,
+ Out reducer(const Out current, const Out in, int normalizer))
+ {
+ int num_resolved_axis;
+ num_resolved_axis = PrepareforReduce(input_shape, output_shape, axes, output_data, init_value);
+ if (num_resolved_axis == -1)
+ {
+ return false;
+ }
+ return ReduceMeanImpl<In, Out>(input_data, input_shape, resolved_axis_data(), num_resolved_axis,
+ temp_index_data(), reducer, output_data);
+ }
+
+ template <typename In, typename Out>
+ inline bool ReduceOp(const Shape &input_shape, const In *input_data, float input_scale,
+ int32_t input_offset, const Shape &output_shape, Out *output_data,
+ float output_scale, int32_t output_offset, const std::vector<int> &axes,
+ bool, Out init_value, int reducer(const int current, const In in))
+ {
+ size_t num_outputs = 1;
+ auto output_dims = output_shape.DimsData();
+
+ for (size_t idx = 0; idx < static_cast<size_t>(output_shape.DimensionsCount()); idx++)
+ {
+ num_outputs *= output_dims[idx];
+ }
+ _temp_sum.resize(num_outputs, 0);
+ int num_resolved_axis;
+ num_resolved_axis = PrepareforReduce(input_shape, output_shape, axes, output_data, init_value);
+ if (num_resolved_axis == -1)
+ {
+ return false;
+ }
+
+ size_t normalizer =
+ ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis,
+ temp_index_data(), reducer, _temp_sum.data());
+ if (num_outputs > 0)
+ {
+ float scale = input_scale / output_scale;
+ float bias = -input_offset * scale;
+ for (size_t idx = 0; idx < num_outputs; idx++)
+ {
+ float float_mean = static_cast<float>(_temp_sum[idx]) / normalizer;
+ float result = std::min(round_nearest(float_mean * scale + bias + output_offset),
+ static_cast<float>(std::numeric_limits<Out>::max()));
+ result = std::max(result, static_cast<float>(std::numeric_limits<Out>::min()));
+ output_data[idx] = static_cast<Out>(result);
+ }
+ }
+    return true;
+ }
+
+private:
+ std::vector<int> _temp_sum;
+};
+
+template <typename In, typename Out>
+void Mean(const Shape &input_shape, const In *input_data, const Shape &output_shape,
+ Out *output_data, const std::vector<int> &axes)
+{
+ UNUSED_RELEASE(output_shape);
+ assert(input_shape.DimensionsCount() > 0);
+ ReduceMean m_obj;
+ m_obj.ReduceOp<In, Out>(input_shape, input_data, output_shape, output_data, axes, true, (Out)0,
+ mean_reducer);
+}
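+
+// For example, Mean over axes {1} of a float tensor of shape [2, 3] holding
+// [[1, 2, 3], [4, 5, 6]] writes a tensor of shape [2] holding [2, 5].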
+
+template <typename In, typename Out>
+void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_scale,
+ int32_t input_offset, const Shape &output_shape, Out *output_data,
+ float output_scale, int32_t output_offset, const std::vector<int> &axes)
+{
+ UNUSED_RELEASE(output_shape);
+ assert(input_shape.DimensionsCount() > 0);
+ ReduceMean m_obj;
+ m_obj.ReduceOp<In, Out>(input_shape, input_data, input_scale, input_offset, output_shape,
+ output_data, output_scale, output_offset, axes, true, (Out)0,
+ sum_reducer);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REDUCEMEAN_H__
diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h
new file mode 100644
index 000000000..7fc1e9123
--- /dev/null
+++ b/compute/cker/include/cker/operation/ResizeBilinear.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RESIZEBILINEAR_H__
+#define __NNFW_CKER_RESIZEBILINEAR_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x,
+ int32_t y, int32_t depth, int32_t batch,
+ const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int32_t input_width = input_shape.Dims(2);
+ const int32_t output_width = output_shape.Dims(2);
+
+ const int32_t input_x_offset = (x1 - x0) * depth;
+ const int32_t input_y_offset = (y1 - y0) * depth * input_width;
+ const int32_t output_x_offset = depth;
+ const int32_t output_y_offset = depth * output_width;
+
+ for (int ch = 0; ch < depth; ch++)
+ {
+ const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch);
+
+ float x0y0 = input_data[input_offset];
+ float x1y0 = input_data[input_offset + input_x_offset];
+ float x0y1 = input_data[input_offset + input_y_offset];
+ float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+ // Top left corner.
+ const int32_t output_offset = Offset(output_shape, batch, y, x, ch);
+ output_data[output_offset] = x0y0;
+
+ // Top right corner.
+ output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+ // Bottom left corner.
+ float output = (x0y0 + x0y1) / 2;
+ output_data[output_offset + output_y_offset] = output;
+
+ // Bottom right corner.
+ output_data[output_offset + output_x_offset + output_y_offset] =
+ (output + ((x1y0 + x1y1) / 2)) / 2;
+ }
+}
+
+inline void ResizeBilinear2x2(int32_t batches, int32_t input_height, int32_t input_width,
+ int32_t depth, int32_t output_height, int32_t output_width,
+ const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ for (int b = 0; b < batches; b++)
+ {
+ for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++)
+ {
+ for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++)
+ {
+ int32_t x1 = std::min(x0 + 1, input_width - 1);
+ int32_t y1 = std::min(y0 + 1, input_height - 1);
+ ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data,
+ output_shape, output_data);
+ }
+ }
+ }
+}
+
+inline void ResizeBilinearKernel(const float *input_ptr, int32_t depth, float scale,
+ float *output_ptr)
+{
+ for (int32_t i = 0; i < depth; i++)
+ {
+ *output_ptr += *input_ptr * scale;
+ output_ptr++;
+ input_ptr++;
+ }
+}
+
+inline void ComputeInterpolationValues(const float value, const float scale,
+ const bool half_pixel_centers, int32_t input_size,
+ float *scaled_value, int32_t *lower_bound,
+ int32_t *upper_bound)
+{
+ if (half_pixel_centers)
+ {
+ *scaled_value = (value + 0.5f) * scale - 0.5f;
+ }
+ else
+ {
+ *scaled_value = value * scale;
+ }
+ float scaled_value_floor = std::floor(*scaled_value);
+ *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
+ *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
+}
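+
+// For example, with half_pixel_centers == false, scale == 0.5f, value == 3 and
+// input_size == 4, this yields scaled_value == 1.5f, lower_bound == 1 and upper_bound == 2.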
+
+inline void ResizeBilinearGeneric(int32_t batches, int32_t input_height, int32_t input_width,
+ int32_t depth, int32_t output_height, int32_t output_width,
+ float height_scale, float width_scale, const Shape &input_shape,
+ const float *input_data, float *output_data,
+ const bool half_pixel_centers)
+{
+ memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float));
+
+ int32_t output_offset = 0;
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ float input_y;
+ int32_t y0, y1;
+ ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
+ &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ float input_x;
+ int32_t x0, x1;
+ ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
+ &x1);
+ float *output_ptr = &output_data[output_offset];
+
+ // Run kernel on the 4 corners of the bilinear resize algorithm.
+ int32_t input_offset = Offset(input_shape, b, y0, x0, 0);
+ float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
+ const float *input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y0, x1, 0);
+ scale = (1 - (input_y - y0)) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y1, x0, 0);
+ scale = (input_y - y0) * (1 - (input_x - x0));
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y1, x1, 0);
+ scale = (input_y - y0) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ output_offset += depth;
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_height,
+ int32_t input_width, int32_t depth,
+ int32_t output_height, int32_t output_width,
+ float height_scale, float width_scale,
+ const Shape &input_shape, const T *input_data,
+ T *output_data, const bool half_pixel_centers)
+{
+ T *output_ptr = &output_data[0];
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ float input_y;
+ int32_t y0, y1;
+ ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
+ &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ float input_x;
+ int32_t x0, x1;
+ ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
+ &x1);
+
+ int32_t input_offset[4] = {
+ Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
+ Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
+ float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
+ (1 - (input_y - y0)) * (input_x - x0),
+ (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)};
+
+ for (int d = 0; d < depth; d++)
+ {
+ const T *input_ptr = &input_data[d];
+ *output_ptr++ = static_cast<T>(
+ input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
+ input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
+ }
+ }
+ }
+ }
+}
+
+inline void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape,
+                           const float *input_data, const Shape &output_shape, float *output_data)
+{
+ int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0));
+ int32_t input_height = input_shape.Dims(1);
+ int32_t input_width = input_shape.Dims(2);
+ int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3));
+
+ // Specialize for 2x2 upsample.
+ if (!params.align_corners && !params.half_pixel_centers &&
+ params.output_height == 2 * input_height && params.output_width == 2 * input_width)
+ {
+ ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height,
+ params.output_width, input_shape, input_data, output_shape, output_data);
+ }
+ else
+ {
+ float height_scale = static_cast<float>(input_height) / params.output_height;
+ float width_scale = static_cast<float>(input_width) / params.output_width;
+ if (params.align_corners && params.output_height > 1)
+ {
+ height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1);
+ }
+ if (params.align_corners && params.output_width > 1)
+ {
+ width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1);
+ }
+
+ ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height,
+ params.output_width, height_scale, width_scale, input_shape, input_data,
+ output_data, params.half_pixel_centers);
+ }
+}
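+
+// For example, resizing input_height == 2 to params.output_height == 5 uses
+// height_scale == 2 / 5 == 0.4f by default, and height_scale == (2 - 1) / (5 - 1) == 0.25f
+// when params.align_corners is set.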
+
+inline void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape,
+                           const uint8_t *input_data, const Shape &output_shape,
+                           uint8_t *output_data)
+{
+ int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ int32_t input_height = input_shape.Dims(1);
+ int32_t input_width = input_shape.Dims(2);
+ int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+ float height_scale = (params.align_corners && params.output_height > 1)
+ ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
+ : (static_cast<float>(input_height) / params.output_height);
+
+ float width_scale = (params.align_corners && params.output_width > 1)
+ ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
+ : (static_cast<float>(input_width) / params.output_width);
+
+ ResizeBilinearGenericSmallChannel<uint8_t>(
+ batches, input_height, input_width, depth, params.output_height, params.output_width,
+ height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RESIZEBILINEAR_H__
diff --git a/compute/cker/include/cker/operation/Reverse.h b/compute/cker/include/cker/operation/Reverse.h
new file mode 100644
index 000000000..ef4673f21
--- /dev/null
+++ b/compute/cker/include/cker/operation/Reverse.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REVERSE_H__
+#define __NNFW_CKER_REVERSE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+void Reverse(int axis, const Shape &input_shape, const Scalar *input_data, const Shape &,
+ Scalar *output_data)
+{
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_axis = input_shape.Dims(axis);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_axis; ++j)
+ {
+ const int start_pos = (i * dims_at_axis + j) * copy_size;
+ Scalar *output_ptr = output_data + start_pos;
+ int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
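+
+// For example, reversing axis 1 of a [2, 3] tensor holding [[1, 2, 3], [4, 5, 6]]
+// yields [[3, 2, 1], [6, 5, 4]].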
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REVERSE_H__
diff --git a/compute/cker/include/cker/operation/Round.h b/compute/cker/include/cker/operation/Round.h
new file mode 100644
index 000000000..a04a741cf
--- /dev/null
+++ b/compute/cker/include/cker/operation/Round.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ROUND_H__
+#define __NNFW_CKER_ROUND_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline float RoundToNearest(float value)
+{
+ auto floor_val = std::floor(value);
+ auto diff = value - floor_val;
+ if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0)))
+ {
+ return floor_val;
+ }
+ else
+ {
+    return floor_val + 1.0f;
+ }
+}
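+
+// Illustrative values: RoundToNearest(0.5f) == 0.0f, RoundToNearest(1.5f) == 2.0f and
+// RoundToNearest(2.5f) == 2.0f, i.e. ties are rounded to the nearest even value.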
+
+inline void Round(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+    // Note that this implementation matches that of TensorFlow's tf.round and
+    // corresponds to the banker's rounding method.
+    // cfenv (for fesetround) is not yet universally supported on Android, so this
+    // is used as a workaround.
+ output_data[i] = RoundToNearest(input_data[i]);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ROUND_H__
diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h
new file mode 100644
index 000000000..ab2de94cc
--- /dev/null
+++ b/compute/cker/include/cker/operation/Select.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SELECT_H__
+#define __NNFW_CKER_SELECT_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename D, typename T>
+void Select(const Shape &input_condition_shape, const D *input_condition_data,
+ const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape,
+ const T *input_y_data, const Shape &output_shape, T *output_data)
+{
+ const int64_t flatsize =
+ MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i];
+ }
+}
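+
+// For example, with condition [1, 0, 1], x [10, 20, 30] and y [1, 2, 3], the output
+// is [10, 2, 30].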
+
+template <typename D, typename T>
+void RankOneSelect(const Shape &input_condition_shape, const D *input_condition_data,
+ const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape,
+ const T *input_y_data, const Shape &output_shape, T *output_data)
+{
+ const int64_t outer_size = input_condition_shape.FlatSize();
+ assert(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0) == outer_size);
+ const int64_t inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+
+ int64_t offset = 0;
+ for (int64_t i = 0; i < outer_size; i++)
+ {
+ const T *input_data = (input_condition_data[i] != 0) ? input_x_data : input_y_data;
+ memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
+ offset += inner_size;
+ }
+}
+
+template <typename D, typename T>
+void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_condition_data,
+ const Shape &input_x_shape, const T *input_x_data,
+ const Shape &input_y_shape, const T *input_y_data,
+ const Shape &output_shape, T *output_data)
+{
+ assert(input_condition_shape.DimensionsCount() <= 4);
+ assert(input_x_shape.DimensionsCount() <= 4);
+ assert(input_y_shape.DimensionsCount() <= 4);
+ assert(output_shape.DimensionsCount() <= 4);
+
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ NdArrayDesc<4> desc_condition;
+ NdArrayDesc<4> desc_x;
+ NdArrayDesc<4> desc_y;
+ NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+ &desc_condition, &desc_x, &desc_y);
+
+  // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+  // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
+ const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
+ const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SELECT_H__
diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h
new file mode 100644
index 000000000..a072cff8e
--- /dev/null
+++ b/compute/cker/include/cker/operation/Slice.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SLICE_H__
+#define __NNFW_CKER_SLICE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void Slice(const SliceParams &op_params, const Shape &input_shape,
+ SequentialTensorWriter<T> *writer)
+{
+ // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+ assert(op_params.begin_count <= 4);
+ assert(op_params.size_count <= 4);
+
+ const int begin_count = op_params.begin_count;
+ const int size_count = op_params.size_count;
+ // We front-pad the begin and size vectors.
+ const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+ const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) ? input_shape.Dims(0)
+ : start_b + op_params.size[0];
+ const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+ const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+ ? input_shape.Dims(1)
+ : start_h + op_params.size[size_count - 3];
+ const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+ const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+ ? input_shape.Dims(2)
+ : start_w + op_params.size[size_count - 2];
+ const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+ const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+ ? input_shape.Dims(3)
+ : start_d + op_params.size[size_count - 1];
+
+ for (int in_b = start_b; in_b < stop_b; ++in_b)
+ {
+ for (int in_h = start_h; in_h < stop_h; ++in_h)
+ {
+ for (int in_w = start_w; in_w < stop_w; ++in_w)
+ {
+ const int len = stop_d - start_d;
+ if (len > 0)
+ writer->WriteN(Offset(input_shape, in_b, in_h, in_w, start_d), len);
+ }
+ }
+ }
+}
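+
+// Illustrative example (assuming a 4-D input of shape [2, 3, 4, 5]): with
+// begin_count == size_count == 2, begin == {1, 0} and size == {2, -1}, the front-padded
+// ranges become b: [0, 2), h: [0, 3), w: [1, 3), d: [0, 5), so only dimension 2 is sliced.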
+
+template <typename T>
+inline void Slice(const SliceParams &op_params, const Shape &input_shape, const T *input_data,
+ T *output_data)
+{
+ SequentialTensorWriter<T> writer(input_data, output_data);
+ return Slice(op_params, input_shape, &writer);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SLICE_H__
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
new file mode 100644
index 000000000..0e0f364ba
--- /dev/null
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SOFTMAX_H__
+#define __NNFW_CKER_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+#include <fixedpoint/fixedpoint.h>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace reference
+{
+
+// Note: this Softmax function supports inputs of any number of dimensions.
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c)
+ {
+ max = std::max(max, input_data[i * depth + c]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c)
+ {
+ sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta));
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c)
+ {
+ output_data[i * depth + c] =
+ std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum;
+ }
+ }
+}
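+
+// For example, a single row [1, 2, 3] with params.beta == 1 produces approximately
+// [0.090, 0.245, 0.665].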
+} // namespace reference
+
+// Performs softmax along the input of size (input_size * batch_size).
+inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
+ float *out)
+{
+ assert(input_size > 0);
+
+ // For each batch
+ for (int b = 0; b < batch_size; b++)
+ {
+ // Find the max coeff.
+ float max_coeff = in[0];
+ for (int i = 1; i < input_size; i++)
+ {
+ if (in[i] > max_coeff)
+ max_coeff = in[i];
+ }
+
+ // Compute the normalized sum of exps.
+ float exp_sum = 0.0;
+ for (int i = 0; i < input_size; i++)
+ {
+ out[i] = std::exp((in[i] - max_coeff) * beta);
+ exp_sum += out[i];
+ }
+
+ // Divide by the sum of exps.
+ float reciprocal_sum_exp = 1.f / exp_sum;
+ for (int i = 0; i < input_size; i++)
+ {
+ out[i] *= reciprocal_sum_exp;
+ }
+
+ // Advance in and out pointers for the next batch.
+ in += input_size;
+ out += input_size;
+ }
+}
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+  // Validate that the shapes of the input and output are the same
+ MatchingFlatSize(input_shape, output_shape);
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // Compute the exponential first, removing the max coefficient for numerical
+ // stability.
+ out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
+ // We are separating out the exp function so that exp can be vectorized.
+ out_mat = out_mat.array().exp();
+ // Normalize to get the activations.
+ Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
+ out_mat.array().rowwise() *= scale;
+}
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int32_t input_beta_multiplier = params.input_multiplier;
+ const int32_t input_beta_left_shift = params.input_left_shift;
+ const int diff_min = params.diff_min;
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
+ using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ uint8_t max_in_row = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32_t fixed_sum_of_exps = sum_of_exps.raw();
+ int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32_t shifted_sum_minus_one =
+ static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32_t>(1) << 31));
+
+ FixedPoint0 shifted_scale =
+ one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
+ num_bits_over_unit + 31 - 8);
+
+ output_data[i * depth + c] = static_cast<uint8_t>(
+ std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
+ }
+ else
+ {
+ output_data[i * depth + c] = 0;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h
new file mode 100644
index 000000000..feeb358c9
--- /dev/null
+++ b/compute/cker/include/cker/operation/SpaceToBatchND.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPACE_TO_BATCH_ND_H__
+#define __NNFW_CKER_SPACE_TO_BATCH_ND_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void SpaceToBatchND(const SpaceToBatchParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_block_shape_shape,
+ const int32_t *block_shape_data, const Shape &unextended_padding_shape,
+ const int32_t *paddings_data, const Shape &unextended_output_shape,
+ T *output_data)
+{
+ UNUSED_RELEASE(unextended_block_shape_shape);
+ UNUSED_RELEASE(unextended_padding_shape);
+
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int depth = input_shape.Dims(3);
+ const int input_width = input_shape.Dims(2);
+ const int input_height = input_shape.Dims(1);
+ const int input_batch_size = input_shape.Dims(0);
+
+ const int output_width = output_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_batch_size = output_shape.Dims(0);
+
+ const int block_shape_height = block_shape_data[0];
+ const int block_shape_width = block_shape_data[1];
+ const int padding_top = paddings_data[0];
+ const int padding_left = paddings_data[2];
+
+ // For uint8 quantized, the correct padding "zero value" is the output offset.
+ const int32_t pad_value = params.output_offset;
+
+ for (int out_b = 0; out_b < output_batch_size; ++out_b)
+ {
+ int input_batch = out_b % input_batch_size;
+ int shift_w = (out_b / input_batch_size) % block_shape_width;
+ int shift_h = (out_b / input_batch_size) / block_shape_width;
+ for (int out_h = 0; out_h < output_height; ++out_h)
+ {
+ for (int out_w = 0; out_w < output_width; ++out_w)
+ {
+ T *out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
+ if (out_h * block_shape_height + shift_h < padding_top ||
+ out_h * block_shape_height + shift_h >= padding_top + input_height ||
+ out_w * block_shape_width + shift_w < padding_left ||
+ out_w * block_shape_width + shift_w >= padding_left + input_width)
+ {
+ // This may not execute correctly when pad_value != 0 and T != uint8.
+ memset(out, pad_value, depth * sizeof(T));
+ }
+ else
+ {
+ const T *in =
+ input_data + Offset(input_shape, input_batch,
+ (out_h * block_shape_height + shift_h) - padding_top,
+ (out_w * block_shape_width + shift_w) - padding_left, 0);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+ }
+}
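+
+// For example, with block_shape == {2, 2} and input_batch_size == 1, output batch
+// out_b == 3 reads from input batch 0 with shift_w == 1 and shift_h == 1.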
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPACE_TO_BATCH_ND_H__
diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h
new file mode 100644
index 000000000..ef679315e
--- /dev/null
+++ b/compute/cker/include/cker/operation/SpaceToDepth.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__
+#define __NNFW_CKER_SPACE_TO_DEPTH_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int output_depth = output_shape.Dims(3);
+ const int output_width = output_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+
+ const int input_depth = input_shape.Dims(3);
+ const int batch_size = input_shape.Dims(0);
+
+  // Number of contiguous values that we can copy in one iteration.
+ const int stride = params.block_size * input_depth;
+
+ for (int batch = 0; batch < batch_size; ++batch)
+ {
+ for (int out_h = 0; out_h < output_height; ++out_h)
+ {
+ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
+ for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
+ {
+ T *dst = output_ptr;
+ for (int out_w = 0; out_w < output_width; ++out_w)
+ {
+ memcpy(dst, input_data, stride * sizeof(T));
+ input_data += stride;
+ dst += output_depth;
+ }
+ output_ptr += stride;
+ }
+ }
+ }
+}
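+
+// For example, with params.block_size == 2 an input of shape [1, 4, 4, 1] produces an
+// output of shape [1, 2, 2, 4]; each memcpy above copies stride == 2 contiguous values.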
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__
diff --git a/compute/cker/include/cker/operation/Split.h b/compute/cker/include/cker/operation/Split.h
new file mode 100644
index 000000000..08a436ee9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Split.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPLIT_H__
+#define __NNFW_CKER_SPLIT_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+void Split(const SplitParams &params, const Shape &input_shape, const Scalar *input_data,
+ const Shape &output_shape, Scalar *const *output_data)
+{
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ const int copy_size = output_shape.Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
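+
+// For example, splitting an input of shape [2, 6] along axis 1 into num_split == 3
+// outputs of shape [2, 2] gives outer_size == 2, base_inner_size == 1 and copy_size == 2.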
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPLIT_H__
diff --git a/compute/cker/include/cker/operation/SplitV.h b/compute/cker/include/cker/operation/SplitV.h
new file mode 100644
index 000000000..9e46f4b04
--- /dev/null
+++ b/compute/cker/include/cker/operation/SplitV.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPLIT_V_H__
+#define __NNFW_CKER_SPLIT_V_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+void SplitV(const SplitVParams &params, const Shape &input_shape, const Scalar *input_data,
+ std::vector<nnfw::cker::Shape> &output_shapes, Scalar *const *output_data)
+{
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+
+ int64_t split_size = 0;
+
+ for (int i = 0; i < outputs_count; i++)
+ {
+ // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+ for (int j = 0; j < split_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ MatchingDim(output_shapes[i], j, input_shape, j);
+ }
+ }
+ split_size += output_shapes[i].Dims(axis);
+ }
+
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ int copy_size = 0;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ copy_size = output_shapes[i].Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPLIT_V_H__
diff --git a/compute/cker/include/cker/operation/SqDiff.h b/compute/cker/include/cker/operation/SqDiff.h
new file mode 100644
index 000000000..93428d5fd
--- /dev/null
+++ b/compute/cker/include/cker/operation/SqDiff.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REDUCESQDIFF_H__
+#define __NNFW_CKER_REDUCESQDIFF_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+#define SQDIFF(N) \
+ do \
+ { \
+ NdArrayDesc<N> input1_desc; \
+ NdArrayDesc<N> input2_desc; \
+ NdArrayDesc<N> output_desc; \
+ SqDiffImpl<T, N>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \
+ output_data, &input1_desc, &input2_desc, &output_desc); \
+  } while (0)
+
+template <typename T, int N>
+void SqDiffImpl(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data,
+ NdArrayDesc<N> *desc1_in, NdArrayDesc<N> *desc2_in, NdArrayDesc<N> *desc_out)
+{
+ std::vector<int> input_iter;
+ input_iter.resize(N);
+ const auto output_dims = output_shape.DimsData();
+
+ // Copy dims to desc, calculating strides.
+ CopyDimsToDesc<N>(output_shape, desc_out);
+ NdArrayDescsForElementwiseBroadcast<N>(input1_shape, input2_shape, desc1_in, desc2_in);
+
+ do
+ {
+ int input1_indx = SubscriptToIndexGeneric(desc1_in, input_iter.data());
+ int input2_indx = SubscriptToIndexGeneric(desc2_in, input_iter.data());
+ int output_indx = SubscriptToIndexGeneric(desc_out, input_iter.data());
+ output_data[output_indx] = (input1_data[input1_indx] - input2_data[input2_indx]) *
+ (input1_data[input1_indx] - input2_data[input2_indx]);
+ } while (NextIndex(N, output_dims, input_iter.data()));
+}
+
+template <typename T>
+void SqDiff(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data)
+{
+ UNUSED_RELEASE(output_shape);
+ assert(input1_shape.DimensionsCount() > 0 && input2_shape.DimensionsCount() > 0 &&
+ output_shape.DimensionsCount() > 0);
+ int outRank = output_shape.DimensionsCount();
+
+ switch (outRank)
+ {
+ case 4:
+ SQDIFF(4);
+ break;
+
+ case 3:
+ SQDIFF(3);
+ break;
+
+ case 2:
+ SQDIFF(2);
+ break;
+
+ case 1:
+ SQDIFF(1);
+ break;
+
+ default:
+ throw std::runtime_error("Support up to 4-D tensors at present");
+ break;
+ }
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REDUCESQDIFF_H__
diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h
new file mode 100644
index 000000000..d5952ae23
--- /dev/null
+++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
+#define __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/PhiloxRandom.h"
+#include "cker/operation/Helper/RandomOpCpu.h"
+#include "cker/operation/Helper/RandomDistributions.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key,
+                        random::PhiloxRandom::ResultType *out_counter)
+{
+ // Grab the two seeds
+ uint32_t seed0;
+ uint32_t seed1;
+
+ const auto seed_vals = seed.flat<int32_t>();
+
+ seed0 = seed_vals(0);
+ seed1 = seed_vals(1);
+ // Scramble the seeds so that the user doesn't need to worry about which
+ // part of the seed needs to be strong.
+ (*out_key)[0] = 0x3ec8f720;
+ (*out_key)[1] = 0x02461e29;
+ (*out_counter)[0] = static_cast<uint32_t>(seed0);
+ (*out_counter)[1] = (*out_counter)[3] = 0;
+ (*out_counter)[2] = static_cast<uint32_t>(seed1);
+ const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
+ (*out_key)[0] = mix[0];
+ (*out_key)[1] = mix[1];
+ (*out_counter)[0] = (*out_counter)[1] = 0;
+ (*out_counter)[2] = mix[2];
+ (*out_counter)[3] = mix[3];
+}
+
+template <typename Device, class Distribution>
+void Fill(random::PhiloxRandom random, Tensor *output)
+{
+ // Build distribution
+ typedef typename Distribution::ResultElementType T;
+
+ auto flat = output->flat<T>();
+ // Reuse the compute kernels from the stateful random ops
+ functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(),
+ Distribution());
+}
+
+inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data,
+ const Shape &seed_shape, const int *seed_data,
+ const Shape &output_shape, float *output_data)
+{
+ Tensor shape_t;
+ Tensor seed_t;
+
+ shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData());
+ shape_t.buffer = (void *)shape_data;
+
+ seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData());
+ seed_t.buffer = (void *)seed_data;
+
+ Tensor output_t;
+ output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output_t.buffer = output_data;
+
+ random::PhiloxRandom::Key key;
+ random::PhiloxRandom::ResultType counter;
+
+ GenerateKey(seed_t, &key, &counter);
+
+ Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>(
+ random::PhiloxRandom(counter, key), &output_t);
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h
new file mode 100644
index 000000000..2f1089575
--- /dev/null
+++ b/compute/cker/include/cker/operation/StridedSlice.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_STRIDEDSLICE_H__
+#define __NNFW_CKER_STRIDEDSLICE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi)
+{
+ assert(!(hi < lo));
+ if (hi < v)
+ return hi;
+ if (v < lo)
+ return lo;
+ return v;
+}
+
+inline void StridedSlicePadIndices(StridedSliceParams *p, int dim_count)
+{
+ // Add indices and mask bits to fully include extra dimensions
+ assert(dim_count <= 4);
+ assert(dim_count >= p->start_indices_count);
+ assert(p->start_indices_count == p->stop_indices_count);
+ assert(p->stop_indices_count == p->strides_count);
+
+ const int pad_count = dim_count - p->start_indices_count;
+
+ // Pad indices at start, so move arrays by pad_count.
+ for (int i = p->start_indices_count - 1; i >= 0; --i)
+ {
+ p->strides[i + pad_count] = p->strides[i];
+ p->start_indices[i + pad_count] = p->start_indices[i];
+ p->stop_indices[i + pad_count] = p->stop_indices[i];
+ }
+ for (int i = 0; i < pad_count; ++i)
+ {
+ p->start_indices[i] = 0;
+ p->stop_indices[i] = 1;
+ p->strides[i] = 1;
+ }
+
+ // Pad masks with 0s or 1s as required.
+ p->shrink_axis_mask <<= pad_count;
+ p->ellipsis_mask <<= pad_count;
+ p->new_axis_mask <<= pad_count;
+ p->begin_mask <<= pad_count;
+ p->end_mask <<= pad_count;
+ p->begin_mask |= (1 << pad_count) - 1;
+ p->end_mask |= (1 << pad_count) - 1;
+
+ p->start_indices_count = dim_count;
+ p->stop_indices_count = dim_count;
+ p->strides_count = dim_count;
+}
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axis_size - 1] that can be used to index
+// directly into the data.
+inline int StartForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis)
+{
+ const auto begin_mask = params.begin_mask;
+ const auto *start_indices = params.start_indices;
+ const auto *strides = params.strides;
+ // Begin with the specified index.
+ int start = start_indices[axis];
+
+ // begin_mask override
+ if (begin_mask & 1 << axis)
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the first element. These values will get
+ // clamped below (Note: We could have set them to 0 and axis_size-1, but
+ // use lowest() and max() to maintain symmetry with StopForAxis())
+ start = std::numeric_limits<int>::lowest();
+ }
+ else
+ {
+ // Backward iteration - use the last element.
+ start = std::numeric_limits<int>::max();
+ }
+ }
+
+ // Handle negative indices
+ int axis_size = input_shape.Dims(axis);
+ if (start < 0)
+ {
+ start += axis_size;
+ }
+
+ // Clamping
+ start = Clamp(start, 0, axis_size - 1);
+
+ return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element. ie. So if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int StopForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis,
+ int start_for_axis)
+{
+ const auto end_mask = params.end_mask;
+ const auto shrink_axis_mask = params.shrink_axis_mask;
+ const auto *stop_indices = params.stop_indices;
+ const auto *strides = params.strides;
+
+ // Begin with the specified index
+ const bool shrink_axis = shrink_axis_mask & (1 << axis);
+ int stop = stop_indices[axis];
+
+ // When shrinking an axis, the end position does not matter (and can be
+ // incorrect when negative indexing is used, see Issue #19260). Always use
+ // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
+ // already been adjusted for negative indices.
+ if (shrink_axis)
+ {
+ stop = start_for_axis + 1;
+ }
+
+ // end_mask override
+ if (end_mask & (1 << axis))
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ const int axis_size = input_shape.Dims(axis);
+ if (stop < 0)
+ {
+ stop += axis_size;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (strides[axis] > 0)
+ {
+ // Forward iteration
+ stop = Clamp(stop, 0, axis_size);
+ }
+ else
+ {
+ // Backward iteration
+ stop = Clamp(stop, -1, axis_size - 1);
+ }
+
+ return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride)
+{
+  // True once we have run past the end of an axis, i.e. iteration along that
+  // axis should stop.
+ return stride > 0 ? index >= stop : index <= stop;
+}
+
+template <typename T>
+inline StridedSliceParams
+buildStridedSliceParams(const T *begin, const T *end, const T *strides, const uint32_t begin_mask,
+ const uint32_t end_mask, const uint32_t shrink_axis_mask,
+ const uint8_t rank)
+{
+ StridedSliceParams op_params;
+ op_params.start_indices_count = rank;
+ op_params.stop_indices_count = rank;
+ op_params.strides_count = rank;
+
+ for (int i = 0; i < rank; ++i)
+ {
+ op_params.start_indices[i] = begin[i];
+ op_params.stop_indices[i] = end[i];
+ op_params.strides[i] = strides[i];
+
+ assert(op_params.strides[i] != 0);
+ }
+
+ op_params.begin_mask = begin_mask;
+ op_params.ellipsis_mask = 0; // NYI
+ op_params.end_mask = end_mask;
+ op_params.new_axis_mask = 0; // NYI
+ op_params.shrink_axis_mask = shrink_axis_mask;
+
+ assert(sizeof(op_params.begin_mask) * 4 >= rank);
+
+ return op_params;
+}
+
+inline void checkOutputSize(const StridedSliceParams &op_params, const Shape &input_shape,
+                            const Shape &output_shape, uint32_t rank)
+{
+ UNUSED_RELEASE(output_shape);
+
+ int32_t shape_size = 0;
+
+ for (uint32_t idx = 0; idx < rank; ++idx)
+ {
+ int32_t stride = op_params.strides[idx];
+ int32_t begin = StartForAxis(op_params, input_shape, idx);
+ int32_t end = StopForAxis(op_params, input_shape, idx, begin);
+
+ // When shrinking an axis, the end position does not matter (and can be
+ // incorrect when negative indexing is used, see Issue #19260). Always use
+ // begin + 1 to generate a length 1 slice, since begin has
+ // already been adjusted for negative indices by StartForAxis.
+ const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx);
+ if (shrink_axis)
+ {
+ end = begin + 1;
+ }
+
+ int32_t dim_shape = std::ceil((end - begin) / static_cast<float>(stride));
+ dim_shape = dim_shape < 0 ? 0 : dim_shape;
+ if (!shrink_axis)
+ {
+ assert(output_shape.Dims(shape_size) == dim_shape);
+ shape_size++;
+ }
+ }
+
+ assert(output_shape.DimensionsCount() == shape_size);
+}
+
+template <typename T>
+inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+
+ bool optimize = true;
+ int st_count = op_params.strides_count;
+ for (int idx = 0; idx < st_count - 1; idx++)
+ {
+ const int axis_size = unextended_input_shape.Dims(idx);
+ const int start = StartForAxis(op_params, unextended_input_shape, idx);
+ const int stop = StopForAxis(op_params, unextended_input_shape, idx, start);
+ if ((axis_size != 1) && (start != 0 || stop != 0))
+ {
+ optimize = false;
+ break;
+ }
+ }
+
+ if (optimize)
+ {
+ if (op_params.strides[st_count - 1] == 1)
+ {
+ const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1);
+ const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start);
+
+ for (int idx = 0; idx < end - start; idx++)
+ {
+ output_data[idx] = input_data[idx + start];
+ }
+ return;
+ }
+ }
+
+ // Note that the output_shape is not used herein.
+ StridedSliceParams params_copy = op_params;
+
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ // Reverse and pad to 4 dimensions because that is what the runtime code
+ // requires (ie. all shapes must be 4D and are given backwards).
+ StridedSlicePadIndices(&params_copy, 4);
+
+ const int start_b = StartForAxis(params_copy, input_shape, 0);
+ const int stop_b = StopForAxis(params_copy, input_shape, 0, start_b);
+ const int start_h = StartForAxis(params_copy, input_shape, 1);
+ const int stop_h = StopForAxis(params_copy, input_shape, 1, start_h);
+ const int start_w = StartForAxis(params_copy, input_shape, 2);
+ const int stop_w = StopForAxis(params_copy, input_shape, 2, start_w);
+ const int start_d = StartForAxis(params_copy, input_shape, 3);
+ const int stop_d = StopForAxis(params_copy, input_shape, 3, start_d);
+
+ T *out_ptr = output_data;
+ for (int in_b = start_b; !LoopCondition(in_b, stop_b, params_copy.strides[0]);
+ in_b += params_copy.strides[0])
+ {
+ for (int in_h = start_h; !LoopCondition(in_h, stop_h, params_copy.strides[1]);
+ in_h += params_copy.strides[1])
+ {
+ for (int in_w = start_w; !LoopCondition(in_w, stop_w, params_copy.strides[2]);
+ in_w += params_copy.strides[2])
+ {
+ for (int in_d = start_d; !LoopCondition(in_d, stop_d, params_copy.strides[3]);
+ in_d += params_copy.strides[3])
+ {
+ *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
+ }
+ }
+ }
+ }
+}
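+
+// Usage sketch (illustrative only; values are arbitrary): slicing elements
+// [1, 5) of a 1-D tensor with stride 2.
+//
+//   const float input[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
+//   const int begin[1] = {1}, end[1] = {5}, strides[1] = {2};
+//   float output[2]; // ceil((5 - 1) / 2) = 2 elements
+//   auto params = buildStridedSliceParams(begin, end, strides, /*begin_mask=*/0,
+//                                         /*end_mask=*/0, /*shrink_axis_mask=*/0, /*rank=*/1);
+//   StridedSlice(params, Shape({6}), input, Shape({2}), output);
+//   // output is expected to be {1.f, 3.f}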
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_STRIDEDSLICE_H__
diff --git a/compute/cker/include/cker/operation/Tanh.h b/compute/cker/include/cker/operation/Tanh.h
new file mode 100644
index 000000000..8747d52b4
--- /dev/null
+++ b/compute/cker/include/cker/operation/Tanh.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TANH_H__
+#define __NNFW_CKER_TANH_H__
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Tanh(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ auto input_map = MapAsVector(input_data, input_shape);
+ auto output_map = MapAsVector(output_data, output_shape);
+ output_map.array() = input_map.array().tanh();
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TANH_H__
diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h
new file mode 100644
index 000000000..1dcdd9b79
--- /dev/null
+++ b/compute/cker/include/cker/operation/Tile.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TILE_H__
+#define __NNFW_CKER_TILE_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T, typename M>
+void CopyMultipleTimes(const T *in_data, int32_t in_size, M multiplier, T *out_data)
+{
+ for (M i = 0; i < multiplier; ++i)
+ {
+ const T *in_end = in_data + in_size;
+ T *new_out_data = std::copy(in_data, in_end, out_data);
+ in_data = out_data;
+ out_data = new_out_data;
+ }
+}
+
+template <typename T, typename M>
+std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_data,
+ const M *multipliers, T *out_data, int dimension)
+{
+ const int dimension_size = in_dimensions.Dims(dimension);
+ if (dimension == in_dimensions.DimensionsCount() - 1)
+ {
+ CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
+ return std::make_pair(dimension_size,
+ dimension_size * static_cast<int>(multipliers[dimension]));
+ }
+ int total_stride_size = 0, total_tiled_stride_size = 0;
+ const T *copy_from_data = in_data;
+ T *copy_to_data = out_data;
+ for (int i = 0; i < dimension_size; ++i)
+ {
+ int stride_size = 0, tiled_stride_size = 0;
+ std::tie(stride_size, tiled_stride_size) =
+ TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
+ copy_from_data += stride_size;
+ copy_to_data += tiled_stride_size;
+ total_stride_size += stride_size;
+ total_tiled_stride_size += tiled_stride_size;
+ }
+ CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1,
+ out_data + total_tiled_stride_size);
+ return std::make_pair(total_stride_size,
+ static_cast<int>(total_tiled_stride_size * multipliers[dimension]));
+}
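+
+// Usage sketch (illustrative only): tiling a 2 x 2 tensor along its last
+// dimension with multipliers {1, 2}.
+//
+//   const float in[4] = {1.f, 2.f, 3.f, 4.f};
+//   const int32_t multipliers[2] = {1, 2};
+//   float out[8];
+//   TileOneDimension(Shape({2, 2}), in, multipliers, out, /*dimension=*/0);
+//   // out is expected to be {1, 2, 1, 2, 3, 4, 3, 4}, i.e. shape {2, 4}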
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TILE_H__
diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h
new file mode 100644
index 000000000..9d8cd340d
--- /dev/null
+++ b/compute/cker/include/cker/operation/Transpose.h
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRANSPOSE_H__
+#define __NNFW_CKER_TRANSPOSE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+template <typename T>
+void TransposeImpl(const TransposeParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ const int unextended_output_size = unextended_output_shape.DimensionsCount();
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_size <= 4);
+ assert(unextended_output_size == params.perm_count);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+ const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
+ const int output_ext_size = 4 - unextended_output_size;
+
+ // The perm data is extended to match the output, each index incremented by
+ // the amount of front padding of the input shape.
+ int extended_perm[4];
+ for (int i = 0; i < output_ext_size; ++i)
+ {
+ extended_perm[i] = i;
+ }
+ for (int i = 0; i < unextended_output_size; ++i)
+ {
+ extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
+ }
+
+ int out_sizes[4];
+ // Compute the inverse permutation array so we can do an output centered
+ // transpose. Also, check to make sure output_dims is matching input_dims.
+ for (int k = 0; k < 4; k++)
+ {
+ out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k);
+ }
+
+ // Naive transpose loop (iterate on output index and compute input index).
+ int o[4]; // loop index (on output).
+ int i[4];
+ for (o[3] = 0; o[3] < out_sizes[3]; o[3]++)
+ {
+ i[extended_perm[3]] = o[3];
+ for (o[2] = 0; o[2] < out_sizes[2]; o[2]++)
+ {
+ i[extended_perm[2]] = o[2];
+ for (o[1] = 0; o[1] < out_sizes[1]; o[1]++)
+ {
+ i[extended_perm[1]] = o[1];
+ for (o[0] = 0; o[0] < out_sizes[0]; o[0]++)
+ {
+ i[extended_perm[0]] = o[0];
+ output_data[Offset(output_shape, o)] = input_data[Offset(input_shape, i)];
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void Transpose(const TransposeParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ // Transpose kernel only does rearranging values not numeric evaluations on
+ // each cell. It's safe to implement per size of scalar type and this trick
+ // keeps the total code size in a reasonable range.
+ switch (sizeof(T))
+ {
+ case 1:
+ TransposeImpl<int8_t>(params, unextended_input_shape,
+ reinterpret_cast<const int8_t *>(input_data), unextended_output_shape,
+ reinterpret_cast<int8_t *>(output_data));
+ break;
+ case 2:
+ TransposeImpl<int16_t>(params, unextended_input_shape,
+ reinterpret_cast<const int16_t *>(input_data), unextended_output_shape,
+ reinterpret_cast<int16_t *>(output_data));
+ break;
+
+ case 4:
+ TransposeImpl<int32_t>(params, unextended_input_shape,
+ reinterpret_cast<const int32_t *>(input_data), unextended_output_shape,
+ reinterpret_cast<int32_t *>(output_data));
+ break;
+ case 8:
+ TransposeImpl<int64_t>(params, unextended_input_shape,
+ reinterpret_cast<const int64_t *>(input_data), unextended_output_shape,
+ reinterpret_cast<int64_t *>(output_data));
+ break;
+ }
+}
+} // namespace reference
+
+namespace
+{
+
+bool IsTranspose2DApplicable(const TransposeParams &params, const Shape &input_shape, int *dim0,
+ int *dim1)
+{
+ const int dims_cnt = input_shape.DimensionsCount();
+
+ if (dims_cnt == 2)
+ {
+ *dim0 = input_shape.Dims(0);
+ *dim1 = input_shape.Dims(1);
+ return true;
+ }
+
+ const int first_perm = params.perm[0];
+ for (int i = 1; i < dims_cnt; ++i)
+ {
+ int rebased = params.perm[i] - first_perm;
+ if (rebased < 0)
+ {
+ rebased += dims_cnt;
+ }
+ if (rebased != i)
+ {
+ return false;
+ }
+ }
+ *dim0 = 1;
+ *dim1 = 1;
+ for (int i = 0; i < dims_cnt; ++i)
+ {
+ if (i < first_perm)
+ {
+ *dim0 *= input_shape.Dims(i);
+ }
+ else
+ {
+ *dim1 *= input_shape.Dims(i);
+ }
+ }
+ return true;
+}
+
+void RemoveOneSizeDimensions(Shape *input_shape, Shape *output_shape, TransposeParams *params)
+{
+ const int dims_cnt = input_shape->DimensionsCount();
+ assert(params->perm_count == dims_cnt);
+
+ bool foundOneSizeDim = false;
+ for (int i = 0; i < dims_cnt; ++i)
+ {
+ if (input_shape->Dims(i) == 1)
+ {
+ foundOneSizeDim = true;
+ break;
+ }
+ }
+
+ // Return here if there is no one size dimension.
+ if (!foundOneSizeDim)
+ return;
+
+ // Handle the case where all the dimension size is one.
+ if (input_shape->FlatSize() == 1)
+ {
+ input_shape->Resize(1);
+ input_shape->SetDim(0, 1);
+ output_shape->Resize(1);
+ output_shape->SetDim(0, 1);
+ params->perm_count = 1;
+ params->perm[0] = 0;
+ return;
+ }
+
+ // Resize input shape.
+ int new_dims_cnt = 0;
+ for (int i = 0; i < dims_cnt; ++i)
+ {
+ if (input_shape->Dims(i) == 1)
+ {
+ continue;
+ }
+ input_shape->SetDim(new_dims_cnt, input_shape->Dims(i));
+ ++new_dims_cnt;
+ }
+ input_shape->Resize(new_dims_cnt);
+
+ // Resize output shape and re-calculate the perm parameter.
+ TransposeParams new_params;
+ new_dims_cnt = 0;
+ for (int i = 0; i < dims_cnt; ++i)
+ {
+ if (output_shape->Dims(i) == 1)
+ {
+ continue;
+ }
+ new_params.perm[new_dims_cnt] = params->perm[i];
+ output_shape->SetDim(new_dims_cnt, output_shape->Dims(i));
+ ++new_dims_cnt;
+ }
+ output_shape->Resize(new_dims_cnt);
+ new_params.perm_count = new_dims_cnt;
+
+ for (int i = 0; i < new_dims_cnt; ++i)
+ {
+ int min_val_idx = -1;
+ for (int j = 0; j < new_dims_cnt; ++j)
+ {
+ if (new_params.perm[j] >= i &&
+ (min_val_idx == -1 || new_params.perm[min_val_idx] > new_params.perm[j]))
+ {
+ min_val_idx = j;
+ }
+ }
+ new_params.perm[min_val_idx] = i;
+ }
+ *params = new_params;
+}
+
+size_t Flatten(const Shape &input_shape, const Shape &output_shape, const TransposeParams &params,
+ Shape *non_flatten_input_shape, Shape *non_flatten_output_shape,
+ TransposeParams *non_flatten_params)
+{
+ // Calculate the total size of non-flatten dimensions.
+ int skip_dims_cnt = 0;
+ size_t flat_size = input_shape.FlatSize();
+ for (int i = 0; i < params.perm_count; ++i)
+ {
+ if (params.perm[i] == i)
+ {
+ flat_size /= input_shape.Dims(i);
+ ++skip_dims_cnt;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ // Shrink the shapes and re-calculate the perm parameter.
+ const int new_dims_cnt = params.perm_count - skip_dims_cnt;
+ non_flatten_input_shape->Resize(new_dims_cnt);
+ non_flatten_output_shape->Resize(new_dims_cnt);
+ non_flatten_params->perm_count = new_dims_cnt;
+
+ for (int i = skip_dims_cnt; i < params.perm_count; ++i)
+ {
+ non_flatten_input_shape->SetDim(i - skip_dims_cnt, input_shape.Dims(i));
+ non_flatten_output_shape->SetDim(i - skip_dims_cnt, output_shape.Dims(i));
+ non_flatten_params->perm[i - skip_dims_cnt] = params.perm[i];
+ }
+ for (int i = 0; i < new_dims_cnt; ++i)
+ {
+ int min_val_idx = -1;
+ for (int j = 0; j < new_dims_cnt; ++j)
+ {
+ if (non_flatten_params->perm[j] >= i &&
+ (min_val_idx == -1 ||
+ non_flatten_params->perm[min_val_idx] > non_flatten_params->perm[j]))
+ {
+ min_val_idx = j;
+ }
+ }
+ non_flatten_params->perm[min_val_idx] = i;
+ }
+
+ return flat_size;
+}
+
+} // namespace anonymous (util)
+
+// Transpose2D only deals with typical 2D matrix transpose ops.
+// Perform transpose by transposing 4x4 blocks of the input, proceeding from
+// left to right (down the rows) of the input, and then from top to bottom.
+template <typename T>
+inline void Transpose2D(const Shape &input_shape, const T *input_data, const Shape &output_shape,
+ T *output_data)
+{
+ assert(input_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+ UNUSED_RELEASE(output_shape);
+
+ const int d0 = input_shape.DimsData()[0];
+ const int d1 = input_shape.DimsData()[1];
+ const int kLines = 4;
+ const int kSkipSize = (kLines - 1) * d1;
+
+ const T *input = input_data;
+
+ int i = 0;
+ for (; i <= d0 - kLines; i += kLines)
+ {
+ T *output = output_data + i;
+
+ const T *input_ptr = input;
+ optimized_ops_preload_l1_keep(input_ptr);
+ input_ptr += d1;
+ optimized_ops_preload_l1_keep(input_ptr);
+ input_ptr += d1;
+ optimized_ops_preload_l1_keep(input_ptr);
+ input_ptr += d1;
+ optimized_ops_preload_l1_keep(input_ptr);
+
+ int j = 0;
+ for (; j <= d1 - kLines; j += kLines)
+ {
+ input_ptr = input;
+ const T a00 = input_ptr[0];
+ const T a01 = input_ptr[1];
+ const T a02 = input_ptr[2];
+ const T a03 = input_ptr[3];
+ input_ptr += d1;
+ const T a10 = input_ptr[0];
+ const T a11 = input_ptr[1];
+ const T a12 = input_ptr[2];
+ const T a13 = input_ptr[3];
+ input_ptr += d1;
+ const T a20 = input_ptr[0];
+ const T a21 = input_ptr[1];
+ const T a22 = input_ptr[2];
+ const T a23 = input_ptr[3];
+ input_ptr += d1;
+ const T a30 = input_ptr[0];
+ const T a31 = input_ptr[1];
+ const T a32 = input_ptr[2];
+ const T a33 = input_ptr[3];
+
+ output[0] = a00;
+ output[1] = a10;
+ output[2] = a20;
+ output[3] = a30;
+ output += d0;
+
+ output[0] = a01;
+ output[1] = a11;
+ output[2] = a21;
+ output[3] = a31;
+ output += d0;
+
+ output[0] = a02;
+ output[1] = a12;
+ output[2] = a22;
+ output[3] = a32;
+ output += d0;
+
+ output[0] = a03;
+ output[1] = a13;
+ output[2] = a23;
+ output[3] = a33;
+ output += d0;
+
+ input += kLines;
+ }
+ if (j == d1)
+ {
+ input += kSkipSize;
+ }
+ else
+ {
+ for (int p = 0; p < kLines; ++p)
+ {
+ for (int q = 0; q < d1 - j; ++q)
+ {
+ *(output + q * d0 + p) = *(input + p * d1 + q);
+ }
+ }
+ input += (d1 - j) + kSkipSize;
+ }
+ }
+ for (; i < d0; ++i)
+ {
+ T *output = output_data + i;
+ for (int j = 0; j < d1; ++j)
+ {
+ *output = *input;
+ output += d0;
+ ++input;
+ }
+ }
+}
+
+// TODO(alanchiao): see if we can reduce the number
+// of lines of code in branching without affecting latency.
+template <typename T>
+inline void Transpose3D(const TransposeParams &params, const Shape &input_shape,
+ const T *input_data, const Shape &, T *output_data)
+{
+ int s2, s3;
+ s2 = input_shape.Dims(1);
+ s3 = input_shape.Dims(2);
+
+ int p1 = 0;
+ int p2 = 0;
+ int p3 = 0;
+
+ if (params.perm[0] == 2)
+ {
+ p1 = 1;
+ }
+ else if (params.perm[1] == 2)
+ {
+ p2 = 1;
+ }
+ else
+ {
+ p3 = 1;
+ }
+
+ if (params.perm[0] == 1)
+ {
+ p1 = s3;
+ }
+ else if (params.perm[1] == 1)
+ {
+ p2 = s3;
+ }
+ else
+ {
+ p3 = s3;
+ }
+
+ if (params.perm[0] == 0)
+ {
+ p1 = s2 * s3;
+ }
+ else if (params.perm[1] == 0)
+ {
+ p2 = s2 * s3;
+ }
+ else
+ {
+ p3 = s2 * s3;
+ }
+
+ int o_s[3];
+ o_s[0] = input_shape.Dims(params.perm[0]);
+ o_s[1] = input_shape.Dims(params.perm[1]);
+ o_s[2] = input_shape.Dims(params.perm[2]);
+
+ for (int i1 = 0; i1 < o_s[0]; ++i1)
+ {
+ for (int i2 = 0; i2 < o_s[1]; ++i2)
+ {
+ for (int i3 = 0; i3 < o_s[2]; ++i3)
+ {
+ const int i = i1 * p1 + i2 * p2 + i3 * p3;
+ const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3;
+ output_data[o] = input_data[i];
+ }
+ }
+ }
+}
+
+template <typename T>
+void TransposeImpl(const TransposeParams &params, const Shape &input_shape, const T *input_data,
+ const Shape &output_shape, T *output_data)
+{
+ const int dims_cnt = input_shape.DimensionsCount();
+
+ int dim0, dim1;
+ if (IsTranspose2DApplicable(params, input_shape, &dim0, &dim1))
+ {
+ Transpose2D(Shape({dim0, dim1}), input_data, Shape({dim1, dim0}), output_data);
+ return;
+ }
+
+ // TODO(b/141217325): notably Eigen is better suited for
+ // larger inputs whereas Transpose3D is generally
+ // better for smaller ones.
+ //
+ // E.g. on Nexus 5, Eigen is better for size 96^3 and up
+ // and Transpose3D is better for 72^3 and down.
+ //
+ // 96^3 is not mobile-friendly for certain usecases
+ // (e.g. model used in beam search for seq2seq) but is in others.
+ // Consider tradeoffs.
+ if (dims_cnt == 3)
+ {
+ Transpose3D(params, input_shape, input_data, output_shape, output_data);
+ return;
+ }
+
+ // Reroute to the reference version if an optimized method for the given data
+ // is not available.
+ reference::Transpose(params, input_shape, input_data, output_shape, output_data);
+}
+
+template <typename T>
+void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_input_shape,
+ const T *input_data, const Shape &unshrunk_output_shape, T *output_data)
+{
+ const int output_size = unshrunk_output_shape.DimensionsCount();
+ assert(unshrunk_input_shape.DimensionsCount() <= 4);
+ assert(output_size <= 4);
+ assert(output_size == unshrunk_params.perm_count);
+
+ Shape shrunk_input_shape = Shape(unshrunk_input_shape);
+
+ Shape shrunk_output_shape = Shape(unshrunk_output_shape);
+
+ TransposeParams shrunk_params = unshrunk_params;
+
+ // Reduce any dimensions that have one size. Lower transpose op usually
+ // performs better since memory access patterns will be improved.
+ RemoveOneSizeDimensions(&shrunk_input_shape, &shrunk_output_shape, &shrunk_params);
+
+ // Handle identity cases.
+ // TODO(b/140779653): Add an optimization pass in the conversion process to
+ // remove transpose op nodes where they do nothing like the below one.
+ bool identical = true;
+  for (int i = 0; i < shrunk_params.perm_count; ++i)
+  {
+    if (shrunk_params.perm[i] != i)
+    {
+ identical = false;
+ break;
+ }
+ }
+ if (identical)
+ {
+ memcpy(output_data, input_data, unshrunk_input_shape.FlatSize() * sizeof(T));
+ return;
+ }
+
+ // Reduce dimensions by flattening.
+  if (shrunk_params.perm[0] == 0 && output_size >= 3)
+  {
+ Shape non_flatten_input_shape;
+ Shape non_flatten_output_shape;
+ TransposeParams non_flatten_params;
+ const int total_size = shrunk_input_shape.FlatSize();
+
+    const int non_flatten_size =
+        Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
+                &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
+ assert(non_flatten_params.perm[0] != 0);
+
+ for (int i = 0; i < total_size; i += non_flatten_size)
+ {
+ TransposeImpl(non_flatten_params, non_flatten_input_shape, input_data + i,
+ non_flatten_output_shape, output_data + i);
+ }
+ return;
+ }
+
+ // Call non-flattened case.
+  TransposeImpl(shrunk_params, shrunk_input_shape, input_data, shrunk_output_shape,
+                output_data);
+}
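+
+// Usage sketch (illustrative only): transposing a 2 x 3 matrix with
+// permutation {1, 0}.
+//
+//   const float in[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+//   float out[6];
+//   TransposeParams params;
+//   params.perm_count = 2;
+//   params.perm[0] = 1;
+//   params.perm[1] = 0;
+//   Transpose(params, Shape({2, 3}), in, Shape({3, 2}), out);
+//   // out is expected to be {1, 4, 2, 5, 3, 6}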
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_H__
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
new file mode 100644
index 000000000..7db3a1179
--- /dev/null
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__
+#define __NNFW_CKER_TRANSPOSE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void TransposeConv(const TransposeConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &output_shape, float *output_data)
+{
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ // Although transpose convolution simplifies to convolution with transposed
+ // weights for strides of 1, non-unitary striding complicates matters. To
+ // keep this reference implementation as clear as possible, we use a
+ // "scatter" access pattern, where we loop through all the input elements,
+ // computing their influence on the output, rather than looping through the
+ // output elements in the typical "gather" access pattern of a conv. We
+ // therefore must initialize the output array to zero.
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; i++)
+ {
+ output_data[i] = 0.0f;
+ }
+
+ // Loop through input elements one at a time.
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int in_y = 0; in_y < input_height; ++in_y)
+ {
+ for (int in_x = 0; in_x < input_width; ++in_x)
+ {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ // Loop through the output elements it will influence
+ const int out_x_origin = (in_x * stride_width) - pad_width;
+ const int out_y_origin = (in_y * stride_height) - pad_height;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ // Compute output element location
+ const int out_x = out_x_origin + filter_x;
+ const int out_y = out_y_origin + filter_y;
+ // We cannot accumulate out of bounds
+ if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+ (out_y < output_height))
+ {
+ float input_value =
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y,
+ filter_x, in_channel)];
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
+ input_value * filter_value;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
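+
+// Shape note (general transposed-convolution arithmetic, not specific to this
+// implementation): with zero padding, each spatial output extent follows
+//   output = (input - 1) * stride + filter
+// e.g. input 4, stride 2, filter 3 gives output 9. The caller is expected to
+// pass an output_shape consistent with the padding encoded in params.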
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_CONV_H__
diff --git a/compute/cker/include/cker/operation/Unpack.h b/compute/cker/include/cker/operation/Unpack.h
new file mode 100644
index 000000000..242aadf46
--- /dev/null
+++ b/compute/cker/include/cker/operation/Unpack.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UNPACK_H__
+#define __NNFW_CKER_UNPACK_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const Shape &input_shape, const Scalar *input_data,
+ const Shape &output_shape, Scalar *const *output_datas)
+{
+ const int dimensions = input_shape.DimensionsCount();
+ const int outputs_count = params.num_split;
+
+ int outer_size = 1;
+ for (int i = 0; i < params.axis; i++)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = params.axis + 1; i < dimensions; i++)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+ assert(output_shape.FlatSize() == copy_size * outer_size);
+ UNUSED_RELEASE(output_shape);
+
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ Scalar *output_ptr = output_datas[i] + copy_size * k;
+ int loc = k * outputs_count * copy_size + i * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
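+
+// Usage sketch (illustrative only): unpacking a 2 x 3 tensor along axis 0 into
+// two outputs of 3 elements each.
+//
+//   UnpackParams p;
+//   p.axis = 0;
+//   p.num_split = 2;
+//   const float in[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+//   float out0[3], out1[3];
+//   float *outs[2] = {out0, out1};
+//   Unpack(p, Shape({2, 3}), in, Shape({3}), outs);
+//   // out0 is expected to be {1, 2, 3} and out1 to be {4, 5, 6}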
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UNPACK_H__
diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
new file mode 100644
index 000000000..912b01a64
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__
+#define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <utility>
+#include "cker/neon/neon_check.h"
+#include "cker/operation/reference/BinaryArithmeticOps.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "fixedpoint/fixedpoint.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
+inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params, bool switch_inputs,
+ const Shape & /* unswitched_input1_shape */,
+ const T *unswitched_input1_data,
+ const Shape & /* unswitched_input2_shape */,
+ const T *unswitched_input2_data,
+ const Shape & /* output_shape */, T *output_data,
+ ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
+{
+ const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data;
+ const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise add of
+ // sections of the arrays.
+ T *output_data_ptr = output_data;
+ const T *input1_data_ptr = input1_data;
+ const T *input2_data_reset = input2_data;
+ // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+ // between input shapes. y3 for input 1 is always broadcast, and so the
+ // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+ // Put another way,
+ // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+ // input2.shape.FlatSize = y0 * y2 * y3 * y4.
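+  // For example (illustrative numbers only), broadcasting an input1 of flat
+  // size y0*y1*y2*y4 = 1*2*3*5 = 30 against an input2 of flat size
+  // y0*y2*y3*y4 = 1*3*4*5 = 60 writes y0*y1*y2*y3*y4 = 120 output elements.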
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ if (y4 > 1)
+ {
+ // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+ // dimension.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ // We have broadcast y4 of input1 data y3 times, and now move on.
+ input1_data_ptr += y4;
+ }
+ }
+ // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+ else
+ {
+ // Special case of y4 == 1, in which the innermost loop is a single element
+ // and can be combined with the next (y3) as an inner broadcast.
+ //
+ // Note that this handles the case of pure scalar broadcast when
+ // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+ // broadcast with batch (as y2 > 1).
+ //
+ // NOTE The process is the same as the above general case except simplified
+ // for y4 == 1 and the loop over y3 is contained within the
+ // AddScalarBroadcast function.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y3;
+ output_data_ptr += y3;
+ input1_data_ptr += 1;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+}
+
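+// quant8_sum follows the usual quantized-add recipe: each input has its offset
+// applied and is shifted into a common higher-precision scale via a shared
+// left shift and a per-input fixed-point multiplier/shift; the results are
+// added, rescaled by the output multiplier/shift, offset, and finally clamped
+// to the quantized activation range.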
+inline int32_t quant8_sum(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
+ const uint8_t input2_data)
+{
+ const int32_t input1_val = params.input1_offset + input1_data;
+ const int32_t input2_val = params.input2_offset + input2_data;
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ return clamped_output;
+}
+
+inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const uint8x8_t output_activation_min_vector = vdup_n_u8(params.quantized_activation_min);
+ const uint8x8_t output_activation_max_vector = vdup_n_u8(params.quantized_activation_max);
+ for (; i <= size - 8; i += 8)
+ {
+ const uint8x8_t input1_val_original = vld1_u8(input1_data + i);
+ const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
+ const int16x8_t input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+ const int16x8_t input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+ const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
+ const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
+ const int16x4_t input1_val_high = vget_high_s16(input1_val);
+ const int16x4_t input1_val_low = vget_low_s16(input1_val);
+ const int16x4_t input2_val_high = vget_high_s16(input2_val);
+ const int16x4_t input2_val_low = vget_low_s16(input2_val);
+ int32x4_t x11 = vmovl_s16(input1_val_low);
+ int32x4_t x12 = vmovl_s16(input1_val_high);
+ int32x4_t x21 = vmovl_s16(input2_val_low);
+ int32x4_t x22 = vmovl_s16(input2_val_high);
+ const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
+ x11 = vshlq_s32(x11, left_shift_dup);
+ x12 = vshlq_s32(x12, left_shift_dup);
+ x21 = vshlq_s32(x21, left_shift_dup);
+ x22 = vshlq_s32(x22, left_shift_dup);
+ x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+ x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+ x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+ x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+ const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+ const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
+ x11 = vshlq_s32(x11, input1_shift_dup);
+ x12 = vshlq_s32(x12, input1_shift_dup);
+ x21 = vshlq_s32(x21, input2_shift_dup);
+ x22 = vshlq_s32(x22, input2_shift_dup);
+ int32x4_t s1 = vaddq_s32(x11, x21);
+ int32x4_t s2 = vaddq_s32(x12, x22);
+ s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+ s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ s1 = RoundingDivideByPOT(s1, -params.output_shift);
+ s2 = RoundingDivideByPOT(s2, -params.output_shift);
+ const int16x4_t s1_narrowed = vmovn_s32(s1);
+ const int16x4_t s2_narrowed = vmovn_s32(s2);
+ const int16x8_t s =
+ vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
+ const uint8x8_t clamped = vmax_u8(output_activation_min_vector,
+ vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+ vst1_u8(output_data + i, clamped);
+ }
+#endif // NEON
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
+struct BinaryOpFuncAddFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vaddq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a + b; }
+};
+
+struct BinaryOpFuncSubFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vsubq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a - b; }
+};
+
+struct BinaryOpFuncMulFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vmulq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a * b; }
+};
+
+struct BinaryOpFuncDivFloat
+{
+#ifdef USE_NEON
+#ifdef __aarch64__
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vdivq_f32(a, b);
+ }
+#endif // __aarch64__
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a / b; }
+};
+
+template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs
+{
+ template <typename T> static inline T calculate(const T &a, const T &b)
+ {
+ return BASEOPERATOR::calculate(b, a);
+ }
+};
+
+struct BinaryOpActivationFloatNone
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ (void)ceilingParam; // suppress unused argument warning
+ return value;
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ (void)floorParam;
+ return value;
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ (void)ceilingParam;
+ return value;
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ (void)floorParam;
+ return value;
+ }
+};
+
+struct BinaryOpActivationFloatMax
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ (void)ceilingParam; // suppress unused argument warning
+ return value;
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ return vmaxq_f32(value, floorParam);
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ (void)ceilingParam;
+ return value;
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ return std::max(value, floorParam);
+ }
+};
+
+struct BinaryOpActivationFloatMinMax
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ return vminq_f32(value, ceilingParam);
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ return vmaxq_f32(value, floorParam);
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ return std::min(value, ceilingParam);
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ return std::max(value, floorParam);
+ }
+};
+
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam &params,
+ const float *input1_data, const float *input2_data,
+ float *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const auto activation_min = vdupq_n_f32(params.float_activation_min);
+ const auto activation_max = vdupq_n_f32(params.float_activation_max);
+ for (; i <= size - 16; i += 16)
+ {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = OPERATOR::calculate(a10, a20);
+ auto x1 = OPERATOR::calculate(a11, a21);
+ auto x2 = OPERATOR::calculate(a12, a22);
+ auto x3 = OPERATOR::calculate(a13, a23);
+ x0 = ACTIVATION::applyFloor(x0, activation_min);
+ x1 = ACTIVATION::applyFloor(x1, activation_min);
+ x2 = ACTIVATION::applyFloor(x2, activation_min);
+ x3 = ACTIVATION::applyFloor(x3, activation_min);
+ x0 = ACTIVATION::applyCeiling(x0, activation_max);
+ x1 = ACTIVATION::applyCeiling(x1, activation_max);
+ x2 = ACTIVATION::applyCeiling(x2, activation_max);
+ x3 = ACTIVATION::applyCeiling(x3, activation_max);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4)
+ {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+    auto x = OPERATOR::calculate(a1, a2);
+ auto x_clamped =
+ ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+ vst1q_f32(output_data + i, x_clamped);
+ }
+#endif // USE_NEON
+ for (; i < size; i++)
+ {
+ auto x = OPERATOR::calculate(input1_data[i], input2_data[i]);
+ output_data[i] = ACTIVATION::applyCeiling(
+ ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
+ }
+}
+
+// Broadcast binary op template that can often be used for inner loop
+// This function will handle scalar_value (LHS) and vector_values (RHS).
+// Since it's a float function, input params does not matter here.
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
+ const float broadcast_value, const float *input2_data,
+ float *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const auto activation_min = vdupq_n_f32(params.float_activation_min);
+ const auto activation_max = vdupq_n_f32(params.float_activation_max);
+ const auto broadcast_value_dup = vdupq_n_f32(broadcast_value);
+ for (; i <= size - 16; i += 16)
+ {
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = OPERATOR::calculate(broadcast_value_dup, a20);
+ auto x1 = OPERATOR::calculate(broadcast_value_dup, a21);
+ auto x2 = OPERATOR::calculate(broadcast_value_dup, a22);
+ auto x3 = OPERATOR::calculate(broadcast_value_dup, a23);
+ x0 = ACTIVATION::applyFloor(x0, activation_min);
+ x1 = ACTIVATION::applyFloor(x1, activation_min);
+ x2 = ACTIVATION::applyFloor(x2, activation_min);
+ x3 = ACTIVATION::applyFloor(x3, activation_min);
+ x0 = ACTIVATION::applyCeiling(x0, activation_max);
+ x1 = ACTIVATION::applyCeiling(x1, activation_max);
+ x2 = ACTIVATION::applyCeiling(x2, activation_max);
+ x3 = ACTIVATION::applyCeiling(x3, activation_max);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4)
+ {
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = OPERATOR::calculate(broadcast_value_dup, a2);
+ auto x_clamped =
+ ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+ vst1q_f32(output_data + i, x_clamped);
+ }
+#endif // USE_NEON
+ for (; i < size; i++)
+ {
+ auto x = OPERATOR::calculate(broadcast_value, input2_data[i]);
+ output_data[i] = ACTIVATION::applyCeiling(
+ ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
+ }
+}
+
+using BinaryOpImplFloatFuncs =
+ std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *),
+ void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>;
+
+template <class FUNC>
+inline BinaryOpImplFloatFuncs
+getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam &params)
+{
+  if (params.float_activation_max == std::numeric_limits<float>::max())
+  {
+    if (params.float_activation_min == std::numeric_limits<float>::lowest())
+    {
+      return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>,
+                                    BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>);
+    }
+    return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>,
+                                  BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>);
+  }
+  return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>,
+                                BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>);
+}
+
+inline void AddQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const uint8_t *input1_data, const Shape &input2_shape,
+ const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
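+
+// Usage sketch (illustrative only): an element-wise float Add with no
+// activation clamping.
+//
+//   BinaryArithmeticOpParam p{};
+//   p.float_activation_min = std::numeric_limits<float>::lowest();
+//   p.float_activation_max = std::numeric_limits<float>::max();
+//   const float a[4] = {1.f, 2.f, 3.f, 4.f};
+//   const float b[4] = {10.f, 20.f, 30.f, 40.f};
+//   float out[4];
+//   Add(p, Shape({4}), a, Shape({4}), b, Shape({4}), out);
+//   // out is expected to be {11, 22, 33, 44}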
+
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
+ uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+ int32_t clamped_output;
+ for (; i < size; ++i)
+ {
+ clamped_output = quant8_sum(params, broadcast_value, input2_data[i]);
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
+inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const uint8_t *input1_data,
+ const Shape &input2_shape, const uint8_t *input2_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
+ fn = [](const BinaryArithmeticOpParam &params, const uint8_t &a,
+ const uint8_t &b) -> uint8_t {
+ return static_cast<uint8_t>(quant8_sum(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data,
+ input2_shape, input2_data, output_shape,
+ output_data, fn);
+ }
+ else
+ {
+ BinaryBroadcastFiveFold(
+ params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
+ uint8_t *)>(AddElementwiseQuant8),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
+ uint8_t *)>(AddScalarBroadcastQuant8));
+ }
+}
+
+inline void BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a + b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ }
+ else
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+
+ BinaryBroadcastFiveFold(params, params.broadcast_category ==
+ BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data, implFuncs.first, implFuncs.second);
+ }
+}
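+
+// Editorial note (illustrative, not part of the imported sources): the dispatchers above
+// choose between two strategies. kGenericBroadcast shapes fall back to the slow reference
+// loop, while shapes whose broadcast pattern collapses into the five-fold form prepared by
+// the caller go through BinaryBroadcastFiveFold, which runs the elementwise kernel over
+// contiguous inner runs and the scalar-broadcast kernel when one operand's inner run
+// degenerates to a single value. A typical fast case, assuming params has already been
+// filled in by the calling layer (broadcast_category, activation limits, broadcast shape):
+//
+//   // Per-channel bias add: [1, 8, 8, 16] + [1, 1, 1, 16] -> [1, 8, 8, 16]
+//   BroadcastAddDispatch(params, act_shape, act_data, bias_shape, bias_data,
+//                        act_shape, out_data);
+//
+// Genuinely irregular shapes (e.g. [2, 1, 3] against [1, 4, 1]) take the reference path.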
+
+inline void Sub(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastSubDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else
+ {
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a - b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ }
+}
+
+inline int32_t quant8_mul(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
+ const uint8_t input2_data)
+{
+ const int32_t input1_val = params.input1_offset + input1_data;
+ const int32_t input2_val = params.input2_offset + input2_data;
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+
+ return clamped_output;
+}
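+
+// Editorial note (not part of the imported sources): quant8_sum and quant8_mul implement the
+// standard uint8 requantization scheme. With real values r = S * (q - Z), the params carry
+// the negated input zero points (input1_offset = -Z1, input2_offset = -Z2) and the output
+// zero point (output_offset = +Z_out), so for multiplication
+//
+//   r1 * r2 = S1 * S2 * (q1 - Z1) * (q2 - Z2)
+//   q_out   = Z_out + round_to_nearest((S1 * S2 / S_out) * (q1 - Z1) * (q2 - Z2))
+//
+// where the real scale S1 * S2 / S_out is encoded as output_multiplier (fixed point) plus
+// output_shift and applied by MultiplyByQuantizedMultiplier, followed by clamping to the
+// quantized activation range.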
+
+inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
+ const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdup_n_u8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdup_n_u8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
+ for (; i <= size - 8; i += 8)
+ {
+ // We load / store 8 at a time, multiplying as two sets of 4 int32s.
+ const auto input1_val_original = vld1_u8(input1_data + i);
+ const auto input2_val_original = vld1_u8(input2_data + i);
+ const auto input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+ const auto input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+ const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
+ const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
+
+ const auto input1_val_low = vget_low_s16(input1_val);
+ const auto input1_val_high = vget_high_s16(input1_val);
+ const auto input2_val_low = vget_low_s16(input2_val);
+ const auto input2_val_high = vget_high_s16(input2_val);
+
+ auto p1 = vmull_s16(input2_val_low, input1_val_low);
+ auto p2 = vmull_s16(input2_val_high, input1_val_high);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
+ const auto clamped = vmax_u8(output_activation_min_vector,
+ vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+ vst1_u8(output_data + i, clamped);
+ }
+#endif // USE_NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
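+
+// Editorial note (not part of the imported sources): the NEON loop above handles both signs
+// of output_shift with a single instruction sequence by splitting it into
+//
+//   left_shift  = max(0,  output_shift)   // applied with vshlq_s32 before vqrdmulhq_n_s32
+//   right_shift = max(0, -output_shift)   // applied with RoundingDivideByPOT afterwards
+//
+// Exactly one of the two is non-zero, so the result matches the scalar
+// MultiplyByQuantizedMultiplier(input1_val * input2_val, output_multiplier, output_shift)
+// used in the tail loop.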
+
+inline void MulQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const uint8_t *input1_data, const Shape &input2_shape,
+ const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+ int32_t clamped_output;
+ for (; i < size; ++i)
+ {
+ clamped_output = quant8_mul(params, broadcast_value, input2_data[i]);
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
+inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const uint8_t *input1_data,
+ const Shape &input2_shape, const uint8_t *input2_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
+ fn = [](const BinaryArithmeticOpParam &params, const uint8_t &a,
+ const uint8_t &b) -> uint8_t {
+ return static_cast<uint8_t>(quant8_mul(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data,
+ input2_shape, input2_data, output_shape,
+ output_data, fn);
+ return;
+ }
+ BinaryBroadcastFiveFold(
+ params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
+ uint8_t *)>(MulElementwiseQuant8),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
+ uint8_t *)>(MulSimpleBroadcastQuant8));
+}
+
+inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ // TODO: Use GetBinaryArithmeticFn
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a * b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ return;
+ }
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ BinaryBroadcastFiveFold(params, params.broadcast_category ==
+ BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data, implFuncs.first, implFuncs.second);
+}
+
+inline void Div(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+#ifdef __aarch64__
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+#else
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a / b; };
+ reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, fn);
+#endif // __aarch64__
+}
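+
+// Editorial note (not part of the imported sources): Div and BroadcastDivDispatch guard the
+// optimized path with __aarch64__, presumably because the vectorized float path depends on
+// NEON division (vdivq_f32), which is only provided on AArch64; other targets use the
+// reference implementation.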
+
+inline void BroadcastDivDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+#ifdef __aarch64__
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else
+#endif // __aarch64__
+ {
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a / b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__
diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h
new file mode 100644
index 000000000..0f620146c
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/Conv.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_CONV_H__
+#define __NNFW_CKER_OPTIMIZED_CONV_H__
+
+#include "OptimizedUtils.h"
+
+#include "cker/eigen/EigenSupport.h"
+#include "cker/eigen/Utils.h"
+#include "cker/gemmlowp/GEMMSupport.h"
+#include "cker/neon/neon_check.h"
+#include "cker/operation/Common.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <public/gemmlowp.h>
+#include <public/map.h>
+#include <fixedpoint/fixedpoint.h>
+
+#include <vector>
+#include <tuple>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+struct GemmlowpOutputPipeline
+{
+ typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
+ typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+ gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
+ gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
+ Pipeline;
+ static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset,
+ int32_t output_multiplier, int output_left_shift,
+ int32_t output_activation_min, int32_t output_activation_max)
+ {
+ ColVectorMap bias_vector(bias_data, output_rows);
+ gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+ bias_addition_stage.bias_vector = bias_vector;
+ gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
+ quantize_down_stage.result_offset_after_shift = output_offset;
+ quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+ quantize_down_stage.result_exponent = output_left_shift;
+ gemmlowp::OutputStageClamp clamp_stage;
+ clamp_stage.min = output_activation_min;
+ clamp_stage.max = output_activation_max;
+ gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+ return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage,
+ saturating_cast_stage);
+ }
+};
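+
+// Editorial note (not part of the imported sources): gemmlowp applies the stages of the
+// tuple returned by MakeExp in order, i.e. per int32 accumulator it
+//
+//   1. adds the per-row bias (OutputStageBiasAddition),
+//   2. rescales by result_fixedpoint_multiplier and result_exponent and adds
+//      result_offset_after_shift (OutputStageScaleInt32ByFixedPointAndExponent),
+//   3. clamps to [output_activation_min, output_activation_max] (OutputStageClamp),
+//   4. saturate-casts to uint8 (OutputStageSaturatingCastToUint8).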
+
+inline void AddBiasAndEvalActivationFunction(float output_activation_min,
+ float output_activation_max, const Shape &bias_shape,
+ const float *bias_data, const Shape &array_shape,
+ float *array_data)
+{
+ BiasAndClamp(output_activation_min, output_activation_max, bias_shape.FlatSize(), bias_data,
+ array_shape.FlatSize(), array_data);
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data,
+ const Shape &im2col_shape, uint8_t *im2col_data)
+{
+ gemmlowp::GemmContext *gemm_context = gemm_support::GetGemmLowpContext();
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const uint8_t *gemm_input_data = nullptr;
+ const Shape *gemm_input_shape = nullptr;
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+ const bool need_im2col =
+ stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
+ if (need_dilated_im2col)
+ {
+ assert(im2col_data);
+ const int input_zero_point = -input_offset;
+ assert(input_zero_point >= 0);
+ assert(input_zero_point <= 255);
+ DilatedIm2col(params, input_zero_point, input_shape, input_data, filter_shape, output_shape,
+ im2col_data);
+ gemm_input_data = im2col_data;
+ gemm_input_shape = &im2col_shape;
+ }
+ else if (need_im2col)
+ {
+ assert(im2col_data);
+ const int input_zero_point = -input_offset;
+ assert(input_zero_point >= 0);
+ assert(input_zero_point <= 255);
+ Im2col(params, filter_height, filter_width, input_zero_point, input_shape, input_data,
+ im2col_shape, im2col_data);
+ gemm_input_data = im2col_data;
+ gemm_input_shape = &im2col_shape;
+ }
+ else
+ {
+ gemm_input_data = input_data;
+ gemm_input_shape = &input_shape;
+ }
+
+ const int gemm_input_rows = gemm_input_shape->Dims(3);
+ // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
+ // The root cause has not yet been identified though. Same applies below for
+ // the other calls commented out. This is a partial rollback of cl/196819423.
+ // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
+ const int gemm_input_cols =
+ gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
+ const int filter_rows = filter_shape.Dims(0);
+ // See b/79927784.
+ // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
+ const int filter_cols = filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
+ const int output_rows = output_shape.Dims(3);
+ // See b/79927784.
+ // const int output_cols = FlatSizeSkipDim(output_shape, 3);
+ const int output_cols = output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
+ assert(output_rows == filter_rows);
+ assert(output_cols == gemm_input_cols);
+ assert(filter_cols == gemm_input_rows);
+ assert(bias_shape.FlatSize() == output_rows);
+ UNUSED_RELEASE(bias_shape);
+ gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix(
+ filter_data, filter_rows, filter_cols);
+ gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
+ gemm_input_data, gemm_input_rows, gemm_input_cols);
+ gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows,
+ output_cols);
+ const auto &output_pipeline =
+ GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
+ output_shift, output_activation_min, output_activation_max);
+ gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
+ output_pipeline);
+}
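+
+// Editorial note (not part of the imported sources): after im2col the quantized convolution
+// is a single GEMM over matrices laid out as
+//
+//   filter : (output_depth x filter_h * filter_w * input_depth)                row-major
+//   input  : (filter_h * filter_w * input_depth x batches * out_h * out_w)     col-major
+//   output : (output_depth x batches * out_h * out_w)                          col-major
+//
+// For example, a 3x3 convolution with input_depth 8 and output_depth 16 producing a
+// 1x14x14x16 output multiplies a 16x72 filter matrix by a 72x196 im2col matrix.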
+
+} // namespace optimized
+
+namespace multithreaded
+{
+namespace
+{
+template <class T> class EigenTensorConvFunctor
+{
+private:
+ Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding)
+ {
+ switch (padding)
+ {
+ case PaddingType::kValid:
+ return Eigen::PADDING_VALID;
+ case PaddingType::kSame:
+ return Eigen::PADDING_SAME;
+ case PaddingType::kNone:
+ assert(false); // should never get here.
+ return Eigen::PADDING_VALID;
+ }
+    return Eigen::PADDING_SAME; // Prevent a compiler warning about a missing return.
+ }
+
+public:
+ void operator()(const Eigen::ThreadPoolDevice &device, const T *input_data, int input_batches,
+ int input_height, int input_width, int input_depth, const T *filter_data,
+ int filter_height, int filter_width, int filter_count, int stride_rows,
+ int stride_cols, int pad_height, int pad_width, nnfw::cker::PaddingType padding,
+ T *output_data, int output_height, int output_width)
+ {
+ const bool is_1x1_kernel =
+ (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1);
+ const bool is_same_height_width =
+ (filter_height == input_height && filter_width == input_width && pad_width == 0 &&
+ pad_height == 0);
+ if (is_1x1_kernel || is_same_height_width)
+ {
+ // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication.
+ // - output (input_batches * conv_width, filter_count)
+ // - input (input_batches * conv_width, input_depth)
+ // - filter (input_depth, filter_count)
+ // is_same_height_width: If the input data and filter have the same height/width, the 2D
+ // convolution is reduced to matrix multiplication.
+ // - output (input_batches, filter_count)
+ // - input (input_batches, filter_width * filter_height * input_depth)
+ // - filter (filter_width * filter_height * input_depth, filter_count)
+ const int conv_width = output_height * output_width;
+ int io_col = input_batches;
+ int filter_col = input_depth * filter_width * filter_height;
+ if (is_1x1_kernel)
+ {
+ io_col *= conv_width;
+ }
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ eigen_support::EigenMatrix output(output_data, io_col, filter_count);
+ eigen_support::ConstEigenMatrix input(input_data, io_col, filter_col);
+ eigen_support::ConstEigenMatrix filter(filter_data, filter_col, filter_count);
+ eigen_support::MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, filter,
+ dim_pair);
+ }
+ else
+ {
+ eigen_support::EigenTensor output(output_data, input_batches, output_height, output_width,
+ filter_count);
+ eigen_support::ConstEigenTensor input(input_data, input_batches, input_height, input_width,
+ input_depth);
+ eigen_support::ConstEigenTensor filter(filter_data, filter_height, filter_width, input_depth,
+ filter_count);
+ output.device(device) = Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
+ RuntimePadding2EigenPadding(padding));
+ }
+ }
+};
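+
+// Editorial note (not part of the imported sources): as a concrete instance of the 1x1 fast
+// path above, a stride-1 1x1 convolution over a 1x14x14x8 input with 16 output channels
+// contracts a 196x8 input matrix with an 8x16 filter matrix into a 196x16 output matrix
+// (io_col = 1 * 14 * 14 = 196, filter_col = 8 * 1 * 1 = 8), i.e. one matrix multiplication
+// instead of a spatial convolution.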
+} // namespace
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const PaddingType padding = params.padding_type;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ EigenTensorConvFunctor<float> conv_functor;
+ conv_functor(device, input_data, batches, input_height, input_width, input_depth, filter_data,
+ filter_height, filter_width, output_depth, stride_height, stride_width, pad_height,
+ pad_width, padding, output_data, output_height, output_width);
+
+ optimized::AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
+ bias_shape, bias_data, output_shape, output_data);
+}
+
+} // namespace multithreaded
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
new file mode 100644
index 000000000..d383b126d
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
@@ -0,0 +1,2123 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
+#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+
+#include <fixedpoint/fixedpoint.h>
+#include <public/gemmlowp.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel
+{
+};
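+
+// Editorial note (not part of the imported sources): the template parameters select a
+// specialized inner loop. kFixedInputDepth / kFixedDepthMultiplier are compile-time values
+// of the input depth and depth multiplier (0 for kFixedInputDepth means "any depth"), and
+// kAllowStrided indicates whether the kernel honours input_ptr_increment values other than
+// the contiguous input depth. The NEON specializations below are fast paths for common
+// (depth, multiplier) combinations; in the TensorFlow Lite code this header derives from,
+// a generic accumulation routine covers every other combination.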
+
+#ifdef USE_NEON
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8x2_t filter_u8;
+ filter_u8.val[0] = vld1_u8(filter_ptr);
+ filter_u8.val[1] = vld1_u8(filter_ptr + 8);
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] =
+ vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[0].val[i] =
+ vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
+ acc[1].val[i] =
+ vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ acc[0] = vld1q_s32(acc_buffer_ptr);
+ acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc[0]);
+ vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] =
+ vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4x2_t input_dup2 = vzip_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ int outp = 0;
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vmlal_n_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ }
+ input_ptr += 16;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+ acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // We will have to duplicate bytes in a NEON register, 3-fold.
+ // We will do that by register-level table-look-up using VTBL instructions.
+ // Here we prepare the registers containing the table-lookup indices.
+ static const uint8_t dup3_indices_array[3][8] = {
+ {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
+ uint8x8_t dup3_indices[3];
+ for (int i = 0; i < 3; i++)
+ {
+ dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
+ }
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const uint8_t *local_filter_ptr = filter_ptr;
+ const uint8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[3];
+ uint8x8x3_t filter_u8;
+ filter_u8.val[0] = vld1_u8(local_filter_ptr);
+ filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+ filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
+ local_filter_ptr += 24;
+ for (int i = 0; i < 3; i++)
+ {
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ // Load the inputs, duplicate 3-fold, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+
+ uint8x8_t input_u8_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
+ }
+ int16x8_t input_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
+ input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4x3_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+ }
+ // Multiply-accumulate
+ for (int j = 0; j < 3; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+ }
+ acc_buffer_ptr += 24;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const uint16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 3; i++)
+ {
+ const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ local_filter_ptr += 3;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const uint8_t *local_filter_ptr = filter_ptr;
+ const uint8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ uint8x8x2_t filter_u8;
+ filter_u8.val[0] = vld1_u8(local_filter_ptr);
+ filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+ local_filter_ptr += 16;
+ for (int i = 0; i < 2; i++)
+ {
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ // Load the inputs, add input_offset, duplicate 2-fold.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Load the accumulators from acc_buffer.
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Multiply-accumulate.
+ for (int j = 0; j < 2; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
+ }
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ // Load the inputs.
+ const uint16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 2; i++)
+ {
+ const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ local_filter_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const uint8_t *local_filter_ptr = filter_ptr;
+ const uint8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16)
+ {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
+ local_filter_ptr += 16;
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
+ uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
+ local_input_ptr += 16;
+ int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+ int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+ acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+ acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
+ local_filter_ptr += 8;
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const uint16_t input_val = *local_input_ptr++ + input_offset;
+ const uint16_t filter_val = *local_filter_ptr++ + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += input_ptr_increment;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ uint8_t input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+ acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+ uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
+ uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
+ int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
+ filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ uint8_t input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+ int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+ int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+ acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+ acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+ acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+ // We load the first 16 bytes into filter_u8_{0,1} as usual.
+ // Then we load the 8 last bytes into filter_u8_x (x for 'extra').
+ // This is partly redundant: the first 4 bytes of filter_u8_x are the same
+ // as the last 4 bytes of filter_u8_1; only its high half carries new data.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+ uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ uint8_t input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter =
+ vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ uint8_t input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+ acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint16x4_t input_u16 = vdup_n_u16(0);
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
+ input_ptr += input_ptr_increment;
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ if (num_output_pixels <= 0)
+ {
+ return;
+ }
+
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+
+ // Handle one output pixel at a time until the second-to-last pixel: the
+ // 8-byte NEON load reads eight input values while we only process four,
+ // so loading the last pixel this way could read past the end of the input.
+ for (; outp < num_output_pixels - 1; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle the last output pixel.
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
+ int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
+ filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
+ int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+ int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+ int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8_0 = vld1_u8(input_ptr);
+ uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
+ input_ptr += input_ptr_increment;
+ int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+ int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+ // Multiply-accumulate
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+ acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+ acc_buffer_ptr += 12;
+ }
+ }
+};
+#endif
+
+// Accumulates the effect of one row of the filter on a segment of one row of
+// the output, reading the corresponding row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
+ int input_width, const uint8_t *input_data,
+ int16_t input_offset, int pad_width, int depth_multiplier,
+ int filter_width, const uint8_t *filter_data,
+ int16_t filter_offset, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
+{
+ // Sanity check parameters. This is important in particular to ensure
+ // that we keep the number of template instantiations minimal, so we don't
+ // increase binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ assert(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth)
+ {
+ assert(input_depth == kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier)
+ {
+ assert(depth_multiplier == kFixedDepthMultiplier);
+ }
+ assert(output_depth == input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const uint8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
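+ // An output position out_x uses input column
+ //   in_x = out_x * stride - pad_width + dilation_factor * filter_x,
+ // so this tap contributes only while 0 <= in_x < input_width; the bounds
+ // below are effectively ceil-divisions of that range by the stride.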
+ int out_x_loop_start_unclamped = 0;
+ int out_x_loop_end_unclamped = 0;
+ if (kAllowStrided)
+ {
+ if (stride == 2)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+ }
+ else if (stride == 4)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+ }
+ else
+ {
+ out_x_loop_start_unclamped =
+ (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ out_x_loop_end_unclamped =
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ }
+ }
+ else
+ {
+ out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+ out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
+ }
+ // The kernel will have to iterate on the segment of the
+ // output row that starts at out_x_loop_start and ends at out_x_loop_end.
+ const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+ const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Generic fallback of QuantizedDepthwiseConvAccumRow: portable, non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+ int input_width, const uint8_t *input_data,
+ int16_t input_offset, int pad_width,
+ int depth_multiplier, int filter_width,
+ const uint8_t *filter_data, int16_t filter_offset,
+ int out_x_buffer_start, int out_x_buffer_end,
+ int output_depth, int32_t *acc_buffer)
+{
+ const uint8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int out_x_loop_start = std::max(
+ out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+ {
+ const uint8_t *filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ const int16_t input_val = *input_ptr++ + input_offset;
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int16_t filter_val = *filter_ptr++ + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const int32_t *bias_data, int32_t *acc_buffer)
+{
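+ // acc_buffer is laid out as [num_output_pixels][output_depth] int32 values;
+ // each pixel's depth slice starts out as a copy of bias_data. The NEON paths
+ // below only specialize that copy for the most common output_depth values.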
+ int i = 0;
+#ifdef USE_NEON
+ if (output_depth == 1)
+ {
+ const int32x4_t b = vdupq_n_s32(bias_data[0]);
+ for (; i <= num_output_pixels - 16; i += 16)
+ {
+ vst1q_s32(acc_buffer + i + 0, b);
+ vst1q_s32(acc_buffer + i + 4, b);
+ vst1q_s32(acc_buffer + i + 8, b);
+ vst1q_s32(acc_buffer + i + 12, b);
+ }
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + i, b);
+ }
+ }
+ else if (output_depth == 2)
+ {
+ int32x4_t b = vdupq_n_s32(bias_data[0]);
+ b = vsetq_lane_s32(bias_data[1], b, 1);
+ b = vsetq_lane_s32(bias_data[1], b, 3);
+ for (; i <= num_output_pixels - 8; i += 8)
+ {
+ vst1q_s32(acc_buffer + 2 * i + 0, b);
+ vst1q_s32(acc_buffer + 2 * i + 4, b);
+ vst1q_s32(acc_buffer + 2 * i + 8, b);
+ vst1q_s32(acc_buffer + 2 * i + 12, b);
+ }
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 2 * i, b);
+ }
+ }
+ else if (output_depth == 4)
+ {
+ const int32x4_t b = vld1q_s32(bias_data);
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + 4 * i + 0, b);
+ vst1q_s32(acc_buffer + 4 * i + 4, b);
+ vst1q_s32(acc_buffer + 4 * i + 8, b);
+ vst1q_s32(acc_buffer + 4 * i + 12, b);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 4 * i, b);
+ }
+ }
+ else if (output_depth == 8)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ vst1q_s32(acc_buffer + 8 * i + 8, b0);
+ vst1q_s32(acc_buffer + 8 * i + 12, b1);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ }
+ }
+ else if (output_depth == 16)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ const int32x4_t b2 = vld1q_s32(bias_data + 8);
+ const int32x4_t b3 = vld1q_s32(bias_data + 12);
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 16 * i + 0, b0);
+ vst1q_s32(acc_buffer + 16 * i + 4, b1);
+ vst1q_s32(acc_buffer + 16 * i + 8, b2);
+ vst1q_s32(acc_buffer + 16 * i + 12, b3);
+ }
+ }
+#endif
+ for (; i < num_output_pixels; i++)
+ {
+ memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+ }
+}
+
+inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ (void)bias_shape;
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+#ifdef USE_NEON
+ const bool shift_left = (output_shift > 0);
+ const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
+#endif
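+ // The down-quantization below (NEON paths and the scalar
+ // MultiplyByQuantizedMultiplier tail alike) effectively computes, per accumulator,
+ //   out = clamp(round(acc * M * 2^output_shift) + output_offset, activation_min, activation_max)
+ // where M is output_multiplier interpreted as a Q31 fixed-point fraction
+ // (i.e. output_multiplier / 2^31).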
+
+ static const int kAccBufferMaxSize = 2048;
+ int32_t acc_buffer[kAccBufferMaxSize];
+ assert(kAccBufferMaxSize >= output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+ assert(kAccBufferActualSize <= kAccBufferMaxSize);
+ assert(kOutputPixelsInAccBuffer >= 1);
+ UNUSED_RELEASE(kAccBufferActualSize);
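+ // Each pass over the filter rows accumulates at most kOutputPixelsInAccBuffer
+ // output x positions, so that all of their output_depth int32 accumulators fit
+ // in the fixed-size on-stack acc_buffer.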
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
+ { \
+ row_accum_func = \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ }
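+ // For example, TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) picks the
+ // QuantizedDepthwiseConvKernel<false, 8, 1> specialization when stride_width == 1,
+ // input_depth == 8 and depth_multiplier == 1; FIXED_INPUT_DEPTH == 0 acts as a
+ // wildcard accepting any input_depth.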
+
+#ifdef USE_NEON
+ // We go over our list of kernels in decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+ // Finally come the kernels that allow a variable input depth;
+ // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif // USE_NEON
+
+ // No matching fast kernel found, use slow fallback.
+ if (!row_accum_func)
+ {
+ row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+ }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+ const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+ const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+ // Now that we have determined row_accum_func, we can start work.
+ uint8_t *output_ptr = output_data;
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start =
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ const int filter_y_end =
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer)
+ {
+ const int out_x_buffer_end =
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+ // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
+ input_data + in_y * input_height_stride + b * input_batch_stride,
+ input_offset, pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_height_stride, filter_offset,
+ out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
+ }
+ // Finished accumulating int32 values. Now need to convert them to
+ // the final 8bit form and store them.
+ const int num_output_values = output_depth * num_output_pixels;
+ int i = 0;
+#ifdef USE_NEON
+ using gemmlowp::RoundingDivideByPOT;
+ const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+ const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
+ const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
+ // Handle 16 values at once.
+ // This allows us to issue 4 mutually independent int32
+ // multiplications (vqrdmulh), which should alleviate most of their
+ // high latency.
+ for (; i <= num_output_values - 16; i += 16)
+ {
+ int32x4_t acc[4];
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
+ }
+
+ if (!shift_left)
+ {
+ // Fixed-point multiplication.
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+ }
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
+ }
+ }
+ else
+ {
+ // Fixed-point multiplication.
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
+ acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+ }
+ }
+ // Add the output offset.
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vaddq_s32(acc[j], output_offset_vec);
+ }
+ // Apply the activation function.
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
+ }
+ for (int j = 0; j < 4; j++)
+ {
+ acc[j] = vminq_s32(acc[j], output_activation_max_vec);
+ }
+ // Saturating cast to uint8_t and store to destination.
+ int16x4_t acc_s16[4];
+ for (int j = 0; j < 4; j++)
+ {
+ acc_s16[j] = vqmovn_s32(acc[j]);
+ }
+ const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
+ const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
+ const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
+ const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
+ vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+ output_ptr += 16;
+ }
+ // Handle 8 values at once.
+ // Not as good as 16 (now we're only issuing 2 mutually independent
+ // vqrdmulh instructions, so we're probably paying for their high
+ // latency).
+ for (; i <= num_output_values - 8; i += 8)
+ {
+ int32x4_t acc0 = vld1q_s32(acc_buffer + i);
+ int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
+ if (!shift_left)
+ {
+ // Fixed-point multiplication.
+ acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+ acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+ // Rounding right shift.
+ acc0 = RoundingDivideByPOT(acc0, -output_shift);
+ acc1 = RoundingDivideByPOT(acc1, -output_shift);
+ }
+ else
+ {
+ // Fixed-point multiplication.
+ acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
+ acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+
+ acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
+ acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+ }
+ // Add the output offset.
+ acc0 = vaddq_s32(acc0, output_offset_vec);
+ acc1 = vaddq_s32(acc1, output_offset_vec);
+ // Apply the activation function.
+ acc0 = vmaxq_s32(acc0, output_activation_min_vec);
+ acc1 = vmaxq_s32(acc1, output_activation_min_vec);
+ acc0 = vminq_s32(acc0, output_activation_max_vec);
+ acc1 = vminq_s32(acc1, output_activation_max_vec);
+ // Saturating cast to uint8_t and store to destination.
+ const int16x4_t acc0_s16 = vqmovn_s32(acc0);
+ const int16x4_t acc1_s16 = vqmovn_s32(acc1);
+ const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
+ const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+ vst1_u8(output_ptr, res_u8);
+ output_ptr += 8;
+ }
+ // Handle 4 values at once. Now we're paying the full price of the
+ // high latency of vqrdmulh. Also, storing only 4 bytes at the end
+ // (without any alignment) can only be done 1 byte at a time.
+ // Yet, that is still worth doing to minimize the amount of leftover
+ // that will have to go through the very slow scalar code.
+ for (; i <= num_output_values - 4; i += 4)
+ {
+ int32x4_t acc = vld1q_s32(acc_buffer + i);
+ if (!shift_left)
+ {
+ // Fixed-point multiplication.
+ acc = vqrdmulhq_n_s32(acc, output_multiplier);
+ // Rounding right shift.
+ acc = RoundingDivideByPOT(acc, -output_shift);
+ }
+ else
+ {
+ // Fixed-point multiplication.
+ acc = vmulq_n_s32(acc, multiplier_power_of_two);
+ acc = vqrdmulhq_n_s32(acc, output_multiplier);
+ }
+ // Add the output offset.
+ acc = vaddq_s32(acc, output_offset_vec);
+ // Apply the activation function.
+ acc = vmaxq_s32(acc, output_activation_min_vec);
+ acc = vminq_s32(acc, output_activation_max_vec);
+ // Saturating cast to uint8_t and store to destination.
+ const int16x4_t acc_s16 = vqmovn_s32(acc);
+ const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
+ const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+ vst1_lane_u8(output_ptr + 0, res_u8, 0);
+ vst1_lane_u8(output_ptr + 1, res_u8, 1);
+ vst1_lane_u8(output_ptr + 2, res_u8, 2);
+ vst1_lane_u8(output_ptr + 3, res_u8, 3);
+ output_ptr += 4;
+ }
+#endif // USE_NEON
+
+ // Handle leftover values, one by one. This is very slow.
+ for (; i < num_output_values; i++)
+ {
+ int32_t acc = acc_buffer[i];
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ *output_ptr++ = static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
new file mode 100644
index 000000000..ae1f9e78e
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__
+#define __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, int b, int kheight,
+ int kwidth, int stride_width, int stride_height,
+ int pad_width, int pad_height, int in_width, int in_height,
+ int in_depth, int single_buffer_length, int buffer_id,
+ const T *in_data, T *conv_buffer_data, uint8_t zero_byte)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ // This chunk of code reshapes all the inputs corresponding to
+ // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
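+ // Each column holds one kheight x kwidth x in_depth input patch; positions of
+ // the patch that fall outside the input image are filled with zero_byte (for
+ // quantized inputs this is expected to be the input zero point).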
+ const int kwidth_times_indepth = kwidth * in_depth;
+ const int inwidth_times_indepth = in_width * in_depth;
+ const int ih_ungated_start = h * stride_height - pad_height;
+ const int ih_ungated_end = (ih_ungated_start + kheight);
+ const int ih_end = std::min(ih_ungated_end, in_height);
+ const int iw_ungated_start = w * stride_width - pad_width;
+ const int iw_ungated_end = (iw_ungated_start + kwidth);
+ const int iw_end = std::min(iw_ungated_end, in_width);
+ // If the patch is off the edge of the input image, skip writing those rows
+ // and columns from the patch into the output array.
+ const int h_offset = std::max(0, -ih_ungated_start);
+ const int w_offset = std::max(0, -iw_ungated_start);
+ const int ih_start = std::max(0, ih_ungated_start);
+ const int iw_start = std::max(0, iw_ungated_start);
+ const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
+ const int output_row_offset = (buffer_id * single_buffer_length);
+ int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
+ int in_offset = Offset(input_shape, b, ih_start, iw_start, 0);
+
+ // Express all of the calculations as padding around the input patch.
+ const int top_padding = h_offset;
+ const int bottom_padding = (ih_ungated_end - ih_end);
+ const int left_padding = w_offset;
+ const int right_padding = (iw_ungated_end - iw_end);
+ assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth));
+
+ // Write out zeroes to the elements representing the top rows of the input
+ // patch that are off the edge of the input image.
+ if (top_padding > 0)
+ {
+ const int top_row_elements = (top_padding * kwidth * in_depth);
+ memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T)));
+ }
+
+ // If the patch is on the interior of the input image horizontally, just copy
+ // over the rows sequentially, otherwise add zero padding at the start or end.
+ if ((left_padding == 0) && (right_padding == 0))
+ {
+ for (int ih = ih_start; ih < ih_end; ++ih)
+ {
+ memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T));
+ out_offset += kwidth_times_indepth;
+ in_offset += inwidth_times_indepth;
+ }
+ }
+ else
+ {
+ for (int ih = ih_start; ih < ih_end; ++ih)
+ {
+ if (left_padding > 0)
+ {
+ const int left_start = (out_offset - (left_padding * in_depth));
+ memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T)));
+ }
+ memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T));
+ if (right_padding > 0)
+ {
+ const int right_start = (out_offset + single_row_num);
+ memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T)));
+ }
+ out_offset += kwidth_times_indepth;
+ in_offset += inwidth_times_indepth;
+ }
+ }
+
+ // If the bottom of the patch falls off the input image, pad the values
+ // representing those input rows with zeroes.
+ if (bottom_padding > 0)
+ {
+ const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
+ const int bottom_start =
+ output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+ memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T)));
+ }
+}
+
+// Supports per-batch zero_byte for per-batch asymmetric quantized inputs.
+template <typename T>
+void DilatedIm2col(const ConvParams &params, const Shape &input_shape, const T *input_data,
+ const Shape &filter_shape, const Shape &output_shape, T *im2col_data,
+ const int32_t *zero_bytes, const int zero_bytes_len)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ // For dilated convolution the input pixels are not contiguous, so we can't
+ // use the same optimizations as Im2Col(). Note that this code would also
+ // work for the non-dilated case, though likely a bit slower.
+ assert(dilation_width_factor != 1 || dilation_height_factor != 1);
+ assert(im2col_data);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ MatchingDim(output_shape, 3, filter_shape, 0);
+
+ // Construct the MxN sized im2col matrix.
+ // The rows, M, are sub-ordered B x H x W
+ const Shape row_shape({1, batches, output_height, output_width});
+ // The columns, N, are sub-ordered Kh x Kw x Din
+ const Shape col_shape({1, filter_height, filter_width, input_depth});
+ // Use dimensions M and N to construct dims for indexing directly into im2col
+ const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()});
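+ // So the full im2col matrix is (batches * output_height * output_width) rows
+ // by (filter_height * filter_width * input_depth) columns.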
+
+ // Loop through the output rows (B x H x W)
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ const T zero_byte =
+ zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ // Each im2col row is an output pixel. Arrange the input data in this
+ // row in an order we can conveniently multiply with the filter data.
+ int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Loop through all the pixels of the filter (Kh x Kw)
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ if ((in_y >= 0) && (in_y < input_height))
+ {
+ // Filter row is within the input data.
+ // Loop through all the filter pixels in this row.
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
+ T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
+ if ((in_x >= 0) && (in_x < input_width))
+ {
+ // Filter pixel is within the input, copy the input data.
+ T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0);
+ memcpy(dst, src, input_depth * sizeof(T));
+ }
+ else
+ {
+ // Filter pixel is outside the input, zero it out.
+ memset(dst, zero_byte, input_depth * sizeof(T));
+ }
+ }
+ }
+ else
+ {
+ // Filter row is outside the input, zero out the entire filter row.
+ int col_offset = Offset(col_shape, 0, filter_y, 0, 0);
+ T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
+ memset(dst, zero_byte, filter_width * input_depth * sizeof(T));
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void DilatedIm2col(const ConvParams &params, uint8_t zero_byte, const Shape &input_shape,
+ const T *input_data, const Shape &filter_shape, const Shape &output_shape,
+ T *im2col_data)
+{
+ const int32_t zero_point = static_cast<int32_t>(zero_byte);
+ DilatedIm2col<T>(params, input_shape, input_data, filter_shape, output_shape, im2col_data,
+ &zero_point, 1);
+}
+
+template <typename T>
+void Im2col(const ConvParams &params, int kheight, int kwidth, uint8_t zero_byte,
+ const Shape &input_shape, const T *input_data, const Shape &output_shape,
+ T *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = input_shape.Dims(3);
+ const int input_width = input_shape.Dims(2);
+ const int input_height = input_shape.Dims(1);
+ const int output_depth = output_shape.Dims(3);
+ const int output_width = output_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+
+ int buffer_id = 0;
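+ // Each output position (b, h, w) owns one column of output_depth values in
+ // output_data (for the im2col output this equals kheight * kwidth * input_depth);
+ // buffer_id enumerates those columns in (b, h, w) order.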
+ // Loop over the output nodes.
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < output_height; ++h)
+ {
+ for (int w = 0; w < output_width; ++w)
+ {
+ ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width,
+ stride_height, pad_width, pad_height, input_width,
+ input_height, input_depth, output_depth, buffer_id, input_data,
+ output_data, zero_byte);
+ ++buffer_id;
+ }
+ }
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__
diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h
new file mode 100644
index 000000000..e8ffd4014
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__
+#define __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Shape &rhs_shape,
+ const float *rhs_data, const Shape &, float *output_data)
+{
+ const Shape extended_lhs_shape = Shape::ExtendedShape(5, lhs_shape);
+ const Shape extended_rhs_shape = Shape::ExtendedShape(5, rhs_shape);
+
+ // Determine which dimension is the broadcast dimension.
+ auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
+ if (lhs_dim == rhs_dim)
+ return lhs_dim;
+ if (lhs_dim == 1)
+ return rhs_dim;
+ assert(rhs_dim == 1);
+ return lhs_dim;
+ };
+
+ // Compute the "extent" for iterating on this dimension.
+ // If we are broadcasting, then don't advance (i.e. return 0).
+ auto extent = [](const Shape &shape, int x) {
+ if (shape.Dims(x) == 1)
+ {
+ return 0;
+ }
+ int prod = 1;
+ for (int i = x + 1; i < shape.DimensionsCount(); ++i)
+ {
+ prod *= shape.Dims(i);
+ }
+ return prod;
+ };
+
+ const int batch_dim0 = broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+ const int batch_dim1 = broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+ const int batch_dim2 = broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+
+ const int lhs_ext0 = extent(extended_lhs_shape, 0);
+ const int lhs_ext1 = extent(extended_lhs_shape, 1);
+ const int lhs_ext2 = extent(extended_lhs_shape, 2);
+ const int rhs_ext0 = extent(extended_rhs_shape, 0);
+ const int rhs_ext1 = extent(extended_rhs_shape, 1);
+ const int rhs_ext2 = extent(extended_rhs_shape, 2);
+
+ // Set params for each matrix multiply.
+ const int lhs_rows = extended_lhs_shape.Dims(3);
+ const int rhs_cols = extended_rhs_shape.Dims(4);
+ const int accum_depth = extended_lhs_shape.Dims(4);
+
+ for (int b0 = 0; b0 < batch_dim0; ++b0)
+ {
+ const float *lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
+ const float *rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
+ for (int b1 = 0; b1 < batch_dim1; ++b1)
+ {
+ const float *lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
+ const float *rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
+ for (int b2 = 0; b2 < batch_dim2; ++b2)
+ {
+ const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
+ const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
+ float *out_ptr =
+ output_data +
+ ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols;
+ for (int j = 0; j < rhs_cols; ++j)
+ {
+ for (int i = 0; i < lhs_rows; ++i)
+ {
+ float total = 0.f;
+ for (int k = 0; k < accum_depth; ++k)
+ {
+ total += lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
+ }
+ int idx = lhs_rows * j + i;
+ out_ptr[idx] = total;
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__
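
The broadcast_dim/extent pair above carries all of the broadcasting logic: a leading batch dimension of size 1 yields an extent of 0, so that operand stands still while the other advances through its batches. Below is a small sketch under assumed shapes (the dimensions and the initializer-list Shape constructor are illustrative, not part of this patch).

#include "cker/Shape.h"

#include <cassert>

void BatchMatMulBroadcastSketch()
{
  using nnfw::cker::Shape;

  // Leading batch dims 2 x 1 x 3 against 1 x 4 x 3; matrix blocks of 5x6 and 6x7,
  // already laid out the way this kernel reads them.
  const Shape lhs({2, 1, 3, 5, 6});
  const Shape rhs({1, 4, 3, 6, 7});

  // Same helper as in the kernel above.
  auto extent = [](const Shape &shape, int x) {
    if (shape.Dims(x) == 1)
      return 0;
    int prod = 1;
    for (int i = x + 1; i < shape.DimensionsCount(); ++i)
      prod *= shape.Dims(i);
    return prod;
  };

  assert(extent(lhs, 1) == 0);              // lhs is broadcast along axis 1
  assert(extent(rhs, 0) == 0);              // rhs is broadcast along axis 0
  assert(extent(lhs, 0) == 1 * 3 * 5 * 6);  // element stride of lhs along axis 0
  // The broadcast batch extents are therefore 2, 4 and 3, i.e. the maximum of
  // the two operands along each leading axis.
}
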
diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
new file mode 100644
index 000000000..f7e39248c
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
+#define __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+template <typename T>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ const int32_t flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.quantized_activation_min,
+ params.quantized_activation_max);
+ }
+}
+
+template <>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data,
+ const std::function<float(const float &, const float &)> &fn)
+{
+ const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] =
+ ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.float_activation_min, params.float_activation_max);
+ }
+}
+
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlowQuant8(
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
+ const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255))
+ {
+ throw std::runtime_error{"Support only for Quant8."};
+ }
+
+ // Comment from tensorflow lite:
+ //
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ ActivationFunctionWithMinMax<uint8_t>(
+ fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
+ }
+ }
+ }
+ }
+}
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ // Comment from tensorflow lite:
+ //
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
+ }
+ }
+ }
+ }
+}
+
+template <>
+inline void BroadcastBinaryArithmeticOpSlow(
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+ float *output_data, const std::function<float(const float &, const float &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.float_activation_min, params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
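
In the float specialization above, the arithmetic itself is supplied as a callable and the result is clamped to [float_activation_min, float_activation_max]. Here is a minimal sketch of an element-wise add; the shapes, values, and wide-open activation range are illustrative assumptions.

#include "cker/operation/reference/BinaryArithmeticOps.h"

#include <limits>
#include <vector>

void ElementwiseAddSketch()
{
  using nnfw::cker::Shape;

  nnfw::cker::BinaryArithmeticOpParam param{};
  param.float_activation_min = std::numeric_limits<float>::lowest();
  param.float_activation_max = std::numeric_limits<float>::max();

  const Shape shape({1, 2, 2, 1});
  const std::vector<float> lhs{1.f, 2.f, 3.f, 4.f};
  const std::vector<float> rhs{10.f, 20.f, 30.f, 40.f};
  std::vector<float> out(4);

  nnfw::cker::reference::BinaryArithmeticOp<float>(
      param, shape, lhs.data(), shape, rhs.data(), shape, out.data(),
      [](const float &a, const float &b) { return a + b; });
  // out is now {11, 22, 33, 44}; the broadcasting variants take the same
  // callable but walk the two inputs with NdArrayDesc strides instead.
}
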
diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h
new file mode 100644
index 000000000..86e8b5143
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/Conv.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_CONV_H__
+#define __NNFW_CKER_REFERENCE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ float input_value = input_data[in_offset + in_channel];
+ float filter_value = filter_data[filter_offset + in_channel];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[out_channel];
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(output_activation_min <= output_activation_max);
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ for (int in_channel = 0; in_channel < input_depth; in_channel++)
+ {
+ int32_t input_val = input_data[in_channel + in_base];
+ int32_t filter_val = filter_data[in_channel + filter_base];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_channel];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_CONV_H__
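
As a quick sanity check on the float path above, a degenerate 1x1 convolution collapses the whole loop nest to total = input * filter + bias before the activation clamp. The sketch below uses made-up shapes and values and an assumed include order; the quantized overload follows the same structure but accumulates (input + input_offset) * (filter + filter_offset) in int32 and requantizes with MultiplyByQuantizedMultiplier before clamping.

#include "cker/Utils.h"  // ActivationFunctionWithMinMax and friends
#include "cker/operation/reference/Conv.h"

#include <limits>

void TinyConvSketch()
{
  using nnfw::cker::Shape;

  nnfw::cker::ConvParams params{};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 0;
  params.padding_values.height = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape in_shape({1, 1, 1, 1}), filter_shape({1, 1, 1, 1});
  const Shape bias_shape({1}), out_shape({1, 1, 1, 1});
  const float input[] = {2.f}, filter[] = {3.f}, bias[] = {0.5f};
  float output[1] = {0.f};

  nnfw::cker::reference::Conv(params, in_shape, input, filter_shape, filter, bias_shape, bias,
                              out_shape, output);
  // output[0] == 2 * 3 + 0.5 == 6.5
}
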
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
new file mode 100644
index 000000000..7b4ff2040
--- /dev/null
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RUY_RUY_SUPPORT_H__
+#define __NNFW_CKER_RUY_RUY_SUPPORT_H__
+
+#include <util/ConfigSource.h>
+#include <ruy/matrix.h>
+#include <ruy/ruy.h>
+#include <cassert>
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace ruy_support
+{
+
+inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy)
+{
+ switch (cache_policy)
+ {
+ case CachePolicy::kNeverCache:
+ return ruy::CachePolicy::kNeverCache;
+ case CachePolicy::kCacheIfLargeSpeedup:
+ return ruy::CachePolicy::kCacheIfLargeSpeedup;
+ case CachePolicy::kAlwaysCache:
+ return ruy::CachePolicy::kAlwaysCache;
+ default:
+ assert(false);
+ return ruy::CachePolicy::kNeverCache;
+ }
+}
+
+template <typename Scalar, typename DataPointer>
+void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
+ ruy::Matrix<Scalar> *dst, bool use_caching = false)
+{
+ ruy::Order ruy_order =
+ params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor;
+ ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout());
+ // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer.
+ // It does care whether we assign to it a Scalar* or a const Scalar*.
+ dst->set_data(data_ptr);
+ dst->set_zero_point(params.zero_point);
+ if (use_caching)
+ {
+ dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy));
+ }
+}
+
+template <typename GemmParamsType, typename RuySpecType>
+void MakeRuyMulParams(const GemmParamsType &params, RuySpecType *ruy_mul_params)
+{
+ // This validation has already been performed by the Gemm API entry point,
+ // but it doesn't hurt to test this specifically again here, where it's
+ // being used.
+ ValidateGemmParams(params);
+
+ ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint);
+ ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent);
+ ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel);
+ ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel);
+ ruy_mul_params->set_bias(params.bias);
+ ruy_mul_params->set_clamp_min(params.clamp_min);
+ ruy_mul_params->set_clamp_max(params.clamp_max);
+}
+
+} // namespace ruy_support
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RUY_RUY_SUPPORT_H__
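
MakeRuyMatrix above is the adapter between cker's MatrixParams description and ruy's own matrix type, and MakeRuyMulParams plays the same role for the multiplier, bias and clamp settings before a ruy::Mul call. Here is a hypothetical sketch of wrapping one operand; the field values and the name lhs_params are illustrative, and ruy plus the nnfw util headers are assumed to be on the include path.

#include "cker/ruy/RuySupport.h"

#include <vector>

void WrapMatrixForRuySketch()
{
  using namespace nnfw::cker;

  MatrixParams<float> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.rows = 4;
  lhs_params.cols = 8;
  lhs_params.zero_point = 0;
  lhs_params.cache_policy = CachePolicy::kNeverCache;

  std::vector<float> lhs_data(4 * 8, 1.0f);

  ruy::Matrix<float> ruy_lhs;
  ruy_support::MakeRuyMatrix(lhs_params, lhs_data.data(), &ruy_lhs);
  // ruy_lhs now carries the layout, data pointer and zero point, ready to be
  // handed to ruy::Mul together with similarly wrapped rhs and dst matrices.
}
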
diff --git a/compute/test/CMakeLists.txt b/compute/test/CMakeLists.txt
new file mode 100644
index 000000000..92aac3e72
--- /dev/null
+++ b/compute/test/CMakeLists.txt
@@ -0,0 +1,17 @@
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+set(TEST_COMPUTE test_compute)
+
+file(GLOB_RECURSE TESTS "*.cc")
+
+add_executable(${TEST_COMPUTE} ${TESTS})
+
+target_link_libraries(${TEST_COMPUTE} nnfw_lib_cker)
+target_link_libraries(${TEST_COMPUTE} gtest)
+target_link_libraries(${TEST_COMPUTE} gtest_main)
+target_link_libraries(${TEST_COMPUTE} ${LIB_PTHREAD} dl)
+add_test(${TEST_COMPUTE} ${TEST_COMPUTE})
+
+install(TARGETS ${TEST_COMPUTE} DESTINATION unittest_standalone)
diff --git a/compute/test/cker/Range.cc b/compute/test/cker/Range.cc
new file mode 100644
index 000000000..55f4fcf20
--- /dev/null
+++ b/compute/test/cker/Range.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/operation/Range.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, Range)
+{
+ {
+ const int start = 0;
+ const int limit = 10;
+ const int delta = 1;
+ std::vector<int> actual(10);
+ nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_EQ(actual[i], i);
+ }
+
+ {
+ const int start = 3;
+ const int limit = 18;
+ const int delta = 3;
+ std::vector<int> expected = {3, 6, 9, 12, 15};
+ std::vector<int> actual(expected.size());
+ nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_EQ(actual[i], expected[i]);
+ }
+
+ {
+ const float start = 3;
+ const float limit = 1;
+ const float delta = -0.5;
+ std::vector<float> expected = {
+ 3, 2.5, 2, 1.5,
+ };
+ std::vector<float> actual(expected.size());
+ nnfw::cker::Range<float>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_FLOAT_EQ(actual[i], expected[i]);
+ }
+}
+
+TEST(CKer_Operation, neg_Range)
+{
+ {
+ const int start = 212;
+ const int limit = 10;
+ const int delta = 1;
+ std::vector<int> actual(10);
+
+ EXPECT_ANY_THROW(nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()));
+ }
+}