Diffstat (limited to 'libs')
-rw-r--r--  libs/.FORMATCHECKED | 0
-rw-r--r--  libs/ARMComputeEx/CMakeLists.txt | 21
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h | 189
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h | 57
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h | 71
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h | 87
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h | 78
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 106
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 301
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h | 45
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h | 49
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h | 72
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h | 81
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h | 69
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h | 109
-rw-r--r--  libs/ARMComputeEx/resolve_includes.py | 102
-rw-r--r--  libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 547
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 138
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 148
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h | 565
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl | 106
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 344
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 406
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl | 96
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl | 103
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 119
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl | 60
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl | 69
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl | 104
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl | 111
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl | 138
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl | 279
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 109
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp | 142
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp | 322
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp | 129
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp | 198
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp | 304
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp | 475
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp (renamed from libs/kernel/acl/src/Init_acl.cpp) | 23
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp | 38
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp | 52
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp | 121
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp | 51
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp | 307
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp | 305
-rw-r--r--  libs/ARMComputeEx/src/runtime/topk_v2.h | 143
-rw-r--r--  libs/CMakeLists.txt | 4
-rw-r--r--  libs/kernel/CMakeLists.txt | 3
-rw-r--r--  libs/kernel/acl/CMakeLists.txt | 94
-rw-r--r--  libs/kernel/acl/src/CLUniqueTensor.h | 63
-rw-r--r--  libs/kernel/acl/src/DepthwiseConv2D.h | 98
-rw-r--r--  libs/kernel/acl/src/DepthwiseConv2D.test.h | 245
-rw-r--r--  libs/kernel/acl/src/FullyConnected.h | 149
-rw-r--r--  libs/kernel/acl/src/FullyConnected.test.h | 266
-rw-r--r--  libs/kernel/acl/src/IO_accessor.cpp | 310
-rw-r--r--  libs/kernel/acl/src/IO_accessor.h | 196
-rw-r--r--  libs/kernel/acl/src/NEUniqueTensor.h | 64
-rw-r--r--  libs/kernel/acl/src/Reshape.h | 70
-rw-r--r--  libs/kernel/acl/src/Reshape.test.h | 51
-rw-r--r--  libs/kernel/acl/src/cl/Concatenation.cpp | 104
-rw-r--r--  libs/kernel/acl/src/cl/Concatenation.test.cpp | 62
-rw-r--r--  libs/kernel/acl/src/cl/Conv2D.cpp | 113
-rw-r--r--  libs/kernel/acl/src/cl/Conv2D.test.cpp | 202
-rw-r--r--  libs/kernel/acl/src/cl/DepthwiseConv2D.cpp | 60
-rw-r--r--  libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp | 20
-rw-r--r--  libs/kernel/acl/src/cl/FullyConnected.cpp | 53
-rw-r--r--  libs/kernel/acl/src/cl/FullyConnected.test.cpp | 20
-rw-r--r--  libs/kernel/acl/src/cl/Pooling.cpp | 130
-rw-r--r--  libs/kernel/acl/src/cl/Pooling.test.cpp | 482
-rw-r--r--  libs/kernel/acl/src/cl/Reshape.cpp | 43
-rw-r--r--  libs/kernel/acl/src/cl/Reshape.test.cpp | 20
-rw-r--r--  libs/kernel/acl/src/cl/Softmax.cpp | 78
-rw-r--r--  libs/kernel/acl/src/cl/Softmax.test.cpp | 105
-rw-r--r--  libs/kernel/acl/src/gtest_env.cpp | 37
-rw-r--r--  libs/kernel/acl/src/neon/Concatenation.cpp | 105
-rw-r--r--  libs/kernel/acl/src/neon/Concatenation.test.cpp | 62
-rw-r--r--  libs/kernel/acl/src/neon/Conv2D.cpp | 111
-rw-r--r--  libs/kernel/acl/src/neon/Conv2D.test.cpp | 202
-rw-r--r--  libs/kernel/acl/src/neon/DepthwiseConv2D.cpp | 61
-rw-r--r--  libs/kernel/acl/src/neon/FullyConnected.cpp | 58
-rw-r--r--  libs/kernel/acl/src/neon/FullyConnected.test.cpp | 21
-rw-r--r--  libs/kernel/acl/src/neon/Pooling.cpp | 128
-rw-r--r--  libs/kernel/acl/src/neon/Pooling.test.cpp | 436
-rw-r--r--  libs/kernel/acl/src/neon/Reshape.cpp | 48
-rw-r--r--  libs/kernel/acl/src/neon/Reshape.test.cpp | 20
-rw-r--r--  libs/kernel/acl/src/neon/Softmax.cpp | 77
-rw-r--r--  libs/kernel/acl/src/neon/Softmax.test.cpp | 105
-rw-r--r--  libs/kernel/acl/src/shape.cpp | 89
-rw-r--r--  libs/kernel/acl/src/shape.h | 93
-rw-r--r--  libs/kernel/acl/src/support.cpp | 51
-rw-r--r--  libs/kernel/acl/src/support.h | 93
-rw-r--r--  libs/kernel/acl/src/util.cpp | 108
-rw-r--r--  libs/kernel/acl/src/util.h | 193
-rw-r--r--  libs/support/nnapi/CMakeLists.txt | 2
-rw-r--r--  libs/support/nnapi/src/Utils.cpp | 29
-rw-r--r--  libs/support/tflite/CMakeLists.txt | 6
-rw-r--r--  libs/support/tflite/src/Diff.cpp | 478
-rw-r--r--  libs/support/tflite/src/FeatureView.cpp | 7
-rw-r--r--  libs/support/tflite/src/Quantization.cpp (renamed from libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp) | 8
-rw-r--r--  libs/support/tflite/src/TensorShapeUtils.cpp | 51
-rw-r--r--  libs/support/tflite/src/TensorView.cpp | 69
-rw-r--r--  libs/support/tflite/src/TensorView.test.cpp | 19
-rw-r--r--  libs/support/tflite/src/interp/FlatBufferBuilder.cpp | 2
-rw-r--r--  libs/support/tflite/src/kernels/RSQRT.cpp | 83
-rw-r--r--  libs/support/tflite/src/kernels/SquaredDifference.cpp | 115
-rw-r--r--  libs/support/tflite/src/kernels/TensorFlowMax.cpp | 390
-rw-r--r--  libs/support/tflite/src/kernels/register.cpp | 169
-rw-r--r--  libs/support/tflite/src/nnapi_delegate.cpp | 720
-rw-r--r--  libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc | 41
-rw-r--r--  libs/util/CMakeLists.txt | 10
-rw-r--r--  libs/util/examples/tensor_index_iterator.cpp | 40
-rw-r--r--  libs/util/include/util/benchmark.h | 66
-rw-r--r--  libs/util/include/util/environment.h | 63
-rw-r--r--  libs/util/include/util/feature/Index.h | 60
-rw-r--r--  libs/util/include/util/feature/IndexIterator.h | 69
-rw-r--r--  libs/util/include/util/feature/Object.h | 79
-rw-r--r--  libs/util/include/util/feature/Reader.h | 40
-rw-r--r--  libs/util/include/util/feature/Shape.h | 47
-rw-r--r--  libs/util/include/util/feature/TextFormatter.h | 84
-rw-r--r--  libs/util/include/util/fp32.h | 71
-rw-r--r--  libs/util/include/util/kernel/IndexIterator.h | 72
-rw-r--r--  libs/util/include/util/kernel/RandomObject.h | 71
-rw-r--r--  libs/util/include/util/kernel/Reader.h | 40
-rw-r--r--  libs/util/include/util/kernel/Shape.h | 48
-rw-r--r--  libs/util/include/util/tensor/Index.h | 62
-rw-r--r--  libs/util/include/util/tensor/IndexFormatter.h | 52
-rw-r--r--  libs/util/include/util/tensor/IndexIterator.h | 104
-rw-r--r--  libs/util/include/util/tensor/NonIncreasingStride.h | 61
-rw-r--r--  libs/util/include/util/tensor/Object.h | 77
-rw-r--r--  libs/util/include/util/tensor/Reader.h | 40
-rw-r--r--  libs/util/include/util/tensor/Shape.h | 63
-rw-r--r--  libs/util/include/util/tensor/Zipper.h | 72
-rw-r--r--  libs/util/include/util/vector.h | 41
-rw-r--r--  libs/util/include/util/vector/Object.h | 63
-rw-r--r--  libs/util/include/util/vector/Reader.h | 40
-rw-r--r--  libs/util/src/environment.cpp | 32
-rw-r--r--  libs/util/src/profiling/time.cc | 49
-rw-r--r--  libs/util/src/tensor/Comparator.cpp | 40
-rw-r--r--  libs/util/src/tensor/Shape.cpp | 55
142 files changed, 9842 insertions(+), 6896 deletions(-)
diff --git a/libs/.FORMATCHECKED b/libs/.FORMATCHECKED
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/libs/.FORMATCHECKED
diff --git a/libs/ARMComputeEx/CMakeLists.txt b/libs/ARMComputeEx/CMakeLists.txt
new file mode 100644
index 000000000..2483fb55d
--- /dev/null
+++ b/libs/ARMComputeEx/CMakeLists.txt
@@ -0,0 +1,21 @@
+if("${TARGET_ARCH}" STREQUAL "x86_64")
+ return()
+endif()
+
+nnfw_find_package(ARMCompute REQUIRED)
+
+set(ACL_EX_BASE ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx)
+
+file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
+
+# generate embedded cl_kernel sources
+execute_process (
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/libs/ARMComputeEx"
+ COMMAND bash -c "python resolve_includes.py"
+)
+
+add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
+set_target_properties(arm_compute_ex PROPERTIES COMPILE_FLAGS "-DEMBEDDED_KERNELS=1")
+target_include_directories(arm_compute_ex PUBLIC ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx)
+target_link_libraries(arm_compute_ex arm_compute_core)
+install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
new file mode 100644
index 000000000..026487077
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/** CLKernelLibraryEx class */
+class CLKernelLibraryEx
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /** Default Constructor. */
+ CLKernelLibraryEx();
+
+public:
+ /** Prevent instances of this class from being copied */
+ CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
+ /** Prevent instances of this class from being copied */
+ const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
+ /** Access the KernelLibrary singleton.
+ * @return The KernelLibrary instance.
+ */
+ static CLKernelLibraryEx &get();
+ /** Initialises the kernel library.
+ *
+ * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
+ * @param[in] context (Optional) CL context used to create programs.
+ * @param[in] device (Optional) CL device for which the programs are created.
+ */
+ void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(),
+ cl::Device device = cl::Device::getDefault())
+ {
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
+ }
+ /** Sets the path that the kernels reside in.
+ *
+ * @param[in] kernel_path Path of the kernel.
+ */
+ void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }
+ /** Gets the path that the kernels reside in.
+ */
+ std::string get_kernel_path() { return _kernel_path; }
+ /** Gets the source of the selected program.
+ *
+ * @param[in] program_name Program name.
+ *
+ * @return Source of the selected program.
+ */
+ std::string get_program_source(const std::string &program_name);
+ /** Sets the CL context used to create programs.
+ *
+ * @note Setting the context also resets the device to the
+ * first one available in the new context.
+ *
+ * @param[in] context A CL context.
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+ if (_context.get() == nullptr)
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+ if (cl_devices.empty())
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ _device = cl_devices[0];
+ }
+ }
+ }
+
+ /** Accessor for the associated CL context.
+ *
+ * @return A CL context.
+ */
+ cl::Context &context() { return _context; }
+
+ /** Sets the CL device for which the programs are created.
+ *
+ * @param[in] device A CL device.
+ */
+ void set_device(cl::Device device) { _device = std::move(device); }
+
+ /** Return the device version
+ *
+ * @return The content of CL_DEVICE_VERSION
+ */
+ std::string get_device_version();
+ /** Creates a kernel from the kernel library.
+ *
+ * @param[in] kernel_name Kernel name.
+ * @param[in] build_options_set Kernel build options as a set.
+ *
+ * @return The created kernel.
+ */
+ Kernel create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set = {}) const;
+ /** Find the maximum number of local work items in a workgroup that the kernel supports.
+ */
+ size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+ /** Return the default NDRange for the device.
+ *
+ */
+ cl::NDRange default_ndrange() const;
+
+ /** Clear the library's cache of binary programs
+ */
+ void clear_programs_cache()
+ {
+ _programs_map.clear();
+ _built_programs_map.clear();
+ }
+
+ /** Access the cache of built OpenCL programs */
+ const std::map<std::string, cl::Program> &get_built_programs() const
+ {
+ return _built_programs_map;
+ }
+
+ /** Add a new built program to the cache
+ *
+ * @param[in] built_program_name Name of the program
+ * @param[in] program Built program to add to the cache
+ */
+ void add_built_program(const std::string &built_program_name, cl::Program program);
+
+private:
+ /** Load program and its dependencies.
+ *
+ * @param[in] program_name Name of the program to load.
+ */
+ const Program &load_program(const std::string &program_name) const;
+ /** Concatenates contents of a set into a single string.
+ *
+ * @param[in] s Input set to concatenate.
+ *
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< Underlying CL device. */
+ std::string _kernel_path; /**< Path to the kernels folder. */
+ mutable std::map<std::string, const Program>
+ _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, cl::Program>
+ _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string>
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string>
+ _program_source_map; /**< Contains sources for all programs.
+ Used for compile-time kernel inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
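For orientation, here is a minimal usage sketch of the class above. It assumes the default CL context/device are valid and that "cast" is one of the kernel names registered in _kernel_program_map (an assumption based on the cast.cl file added by this patch, not a documented API guarantee):

```cpp
// Minimal sketch: initialise CLKernelLibraryEx and create a kernel.
#include "arm_compute/core/CL/CLKernelLibraryEx.h"

using namespace arm_compute;

void create_cast_kernel()
{
    // With -DEMBEDDED_KERNELS the sources are baked in, so the path is unused;
    // otherwise it must point at the directory holding the .cl files.
    CLKernelLibraryEx::get().init("./cl_kernels/", cl::Context::getDefault(),
                                  cl::Device::getDefault());

    // Build options are passed as a set of "-D..." strings (option names here
    // are illustrative, not taken from this patch).
    Kernel kernel = CLKernelLibraryEx::get().create_kernel(
        "cast", {"-DDATA_TYPE_IN=float", "-DDATA_TYPE_OUT=int"});
    (void)kernel; // hand the kernel to an ICLKernel wrapper from here
}
```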
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
new file mode 100644
index 000000000..6bd33bf8f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
+#define __ARM_COMPUTE_CLCASTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a cast operation */
+class CLCastKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLCastKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCastKernel(const CLCastKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCastKernel &operator=(const CLCastKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLCastKernel(CLCastKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLCastKernel &operator=(CLCastKernel &&) = default;
+ /** Default destructor */
+ ~CLCastKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
new file mode 100644
index 000000000..a51441aca
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__
+#define __ARM_COMPUTE_CLGATHERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the gather kernel.
+ *
+ */
+class CLGatherKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLGatherKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGatherKernel(const CLGatherKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGatherKernel &operator=(const CLGatherKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel(CLGatherKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGatherKernel &operator=(CLGatherKernel &&) = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An input tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLGatherKernel
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An input tensor. Data types supported: S32.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
new file mode 100644
index 000000000..cd2b255bc
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pixelwise division kernel.
+ *
+ */
+class CLPixelWiseDivisionKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLPixelWiseDivisionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
+ * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+ * even.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivisionKernel
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */
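The scale constraint documented above (positive, equal to 1/255 or 1/2^n with n in [0, 15]) can be tested with std::frexp. The sketch below mirrors the analogous check in upstream CLPixelWiseMultiplication; it is an illustration of the stated rule, not this kernel's actual validation code:

```cpp
// Sketch of the documented scale check: scale must be 1/255 or 1/2^n, 0 <= n <= 15.
#include <cmath>

bool scale_is_valid(float scale)
{
    if (scale == 1.f / 255.f)
        return true;
    int exponent = 0;
    // frexp decomposes scale = mantissa * 2^exponent with mantissa in [0.5, 1)
    const float mantissa = std::frexp(scale, &exponent);
    // 1/2^n == 0.5 * 2^(1 - n), so n in [0, 15] maps to exponent in [-14, 1]
    return mantissa == 0.5f && exponent >= -14 && exponent <= 1;
}
```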
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
new file mode 100644
index 000000000..a7d96cc5c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduce max kernel.
+ *
+ */
+class CLReduceMaxKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLReduceMaxKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReduceMaxKernel(const CLReduceMaxKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReduceMaxKernel(CLReduceMaxKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] axis Axis to reduce
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ */
+ void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceMaxKernel
+ *
+ * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] axis Axis to reduce
+ * @param[in] output The output tensor info. Data types supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ void run_on_cpu(cl::CommandQueue &queue);
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
new file mode 100644
index 000000000..de9df3381
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel */
+class CLReductionMeanKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLReductionMeanKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionMeanKernel(const CLReductionMeanKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReductionMeanKernel(CLReductionMeanKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default;
+ /** Default destructor */
+ ~CLReductionMeanKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReductionMeanKernel.
+ *
+ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _reduction_axis;
+ BorderSize _border_size;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */
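A kernel-level usage sketch follows; the shapes are illustrative, and CLTensor/CLScheduler are the standard ACL runtime helpers rather than part of this patch:

```cpp
// Sketch: mean-reduce an 8x8 F32 tensor over axes 0 and 1.
#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void reduction_mean_example()
{
    CLScheduler::get().default_init(); // one-time context/queue setup

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(1U, 1U), 1, DataType::F32));

    CLReductionMeanKernel kernel;
    kernel.configure(&input, &output, {0, 1}); // reduce along axes 0 and 1

    input.allocator()->allocate();
    output.allocator()->allocate();
    CLScheduler::get().enqueue(kernel); // runs the kernel over its full window
}
```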
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
new file mode 100644
index 000000000..248ae6635
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to extract a strided slice of a tensor */
+class CLStridedSliceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLStridedSliceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
+ /** Default destructor */
+ ~CLStridedSliceKernel() = default;
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] beginData The begin tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] endData The end tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] stridesData The strides tensor. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] beginMask Mask for begin
+ * @param[in] endMask Mask for end
+ * @param[in] shrinkAxisMask Mask for shrink axis.
+ *
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLStridedSliceKernel
+ *
+ * @param[in] input The input tensor info. Data types supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] output The output tensor info. Data types supported: same as @p input.
+ * @param[in] begin The begin tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] end The end tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] stride The stride tensor info. Data types supported: S32.
+ * The number of dimensions must be 1.
+ * The length must be the same as the number of dimensions of input.
+ * @param[in] beginMask Mask for begin
+ * @param[in] endMask Mask for end
+ * @param[in] shrinkAxisMask Mask for shrink axis.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *stride, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ ICLTensor *_beginData; /**< Start indices of input tensor */
+ ICLTensor *_endData; /**< Stop indices of input tensor */
+ ICLTensor *_stridesData; /**< Strides tensor */
+ int32_t _beginMask; /**< Begin mask */
+ int32_t _endMask; /**< End mask */
+ int32_t _shrinkAxisMask; /**< Shrink axis mask */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */
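The three masks are bit fields with one bit per dimension. By the usual TensorFlow StridedSlice convention (assumed here, since this header does not spell it out), a set bit in beginMask/endMask means "ignore the given index for that dimension and use the lowest/highest bound instead". A sketch of that interpretation for the begin index:

```cpp
// Sketch of conventional beginMask handling (TensorFlow semantics, assumed).
#include <cstdint>

int32_t effective_begin(int32_t begin, int32_t begin_mask, int dim, int32_t dim_size,
                        int32_t stride)
{
    if (begin_mask & (1 << dim))              // bit set: ignore begin[dim]
        return stride > 0 ? 0 : dim_size - 1; // start at the low/high end
    if (begin < 0)                            // negative indices count from the end
        begin += dim_size;
    // clamp into the valid range for this dimension
    return begin < 0 ? 0 : (begin > dim_size - 1 ? dim_size - 1 : begin);
}
```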
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..5c567f38e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <array>
+
+// these parameters can be changed
+#define _ITEMS 16 // number of items in a group
+#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
+#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram
+#define PERMUT // store the final permutation
+////////////////////////////////////////////////////////
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLTopKV2Single : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Single();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Single(const CLTopKV2Single &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Single(CLTopKV2Single &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+
+ void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_topk_values;
+ ICLTensor *_topk_indices;
+};
+
+class CLTopKV2Init : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Init();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Init(const CLTopKV2Init &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Init(CLTopKV2Init &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+
+ void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+};
+
+class CLRadixSortHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ void setPass(int pass, cl::Buffer *in_key_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+};
+
+class CLRadixSortScanHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortScanHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortGlobalScanHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortGlobalScanHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+
+ void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortPasteHistogram : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortPasteHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+
+ void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+class CLRadixSortReorder : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLRadixSortReorder();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortReorder(const CLRadixSortReorder &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRadixSortReorder(CLRadixSortReorder &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+
+ void configure(cl::Buffer *hist_buf, int bits, int n);
+
+ void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _pass = pass;
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ int _pass;
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+class CLTopKV2FindFirstNegative : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2FindFirstNegative();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_out_key_buf;
+};
+
+class CLTopKV2ReorderNegatives : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2ReorderNegatives();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+
+ void configure(cl::Buffer *first_negative_idx_buf, int n);
+
+ void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
+ cl::Buffer *out_ind_buf)
+ {
+ _in_key_buf = in_key_buf;
+ _out_key_buf = out_key_buf;
+ _in_ind_buf = in_ind_buf;
+ _out_ind_buf = out_ind_buf;
+ }
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ cl::Buffer *_in_key_buf;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_in_ind_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+class CLTopKV2Store : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLTopKV2Store();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Store(const CLTopKV2Store &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Store(CLTopKV2Store &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+
+ void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+
+ void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_values;
+ ICLTensor *_indices;
+ cl::Buffer *_out_key_buf;
+ cl::Buffer *_out_ind_buf;
+};
+
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__
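The tuning macros at the top of this header fix the launch geometry of the radix sort. A small sketch of the derived quantities, using the values defined above:

```cpp
// Derived radix-sort launch parameters (sketch; values from the macros above).
#include <cstdio>

int main()
{
    const int items = 16;                  // _ITEMS: work items per group
    const int groups = 4;                  // _GROUPS: number of groups
    const int processors = items * groups; // 64 "virtual processors"
    const int histosplit = processors / 2; // _HISTOSPLIT: 32 histogram splits
    std::printf("processors=%d, histogram splits=%d\n", processors, histosplit);
    return 0;
}
```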
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
new file mode 100644
index 000000000..63050067d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCAST_H__
+#define __ARM_COMPUTE_CLCAST_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLCastKernel
+ *
+ * @note The tensor data type for the input must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the output tensor's data type.
+ */
+class CLCast : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLCAST_H__ */
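A minimal usage sketch (shapes and data types are illustrative; CLTensor/CLScheduler are standard ACL runtime helpers, not part of this patch):

```cpp
// Sketch: cast an F32 tensor to S32 with CLCast.
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"

using namespace arm_compute;

void cast_example()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::S32));

    CLCast cast;
    cast.configure(&input, &output); // the output's data type selects the conversion

    input.allocator()->allocate();
    output.allocator()->allocate();
    cast.run();
    CLScheduler::get().sync(); // wait for the queued work to finish
}
```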
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
new file mode 100644
index 000000000..3ae7afe14
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLGATHER_H__
+#define __ARM_COMPUTE_CLGATHER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLGatherKernel. */
+class CLGather : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGather
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indices tensor. Data types supported: S32.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+}
+#endif /*__ARM_COMPUTE_CLGATHER_H__ */
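A minimal usage sketch (sizes are illustrative):

```cpp
// Sketch: gather 5 elements of a 100-element F32 tensor by S32 indices.
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

void gather_example()
{
    CLScheduler::get().default_init();

    CLTensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(100U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

    CLGather gather;
    gather.configure(&input, &indices, &output); // output[i] = input[indices[i]]

    input.allocator()->allocate();
    indices.allocator()->allocate();
    output.allocator()->allocate();
    gather.run();
}
```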
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
new file mode 100644
index 000000000..c1383e21f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPixelWiseDivisionKernel. */
+class CLPixelWiseDivision : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or
+ * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
+ * even.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivision
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
+ * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale = 1.f,
+ ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
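A minimal usage sketch with the default policies spelled out (shapes are illustrative):

```cpp
// Sketch: element-wise division of two F32 tensors, scale 1, rounding to zero.
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"

using namespace arm_compute;

void division_example()
{
    CLScheduler::get().default_init();

    CLTensor a, b, out;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    CLPixelWiseDivision div;
    // out = (a / b) * scale; these arguments match the declared defaults
    div.configure(&a, &b, &out, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    div.run();
}
```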
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
new file mode 100644
index 000000000..14b473f33
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
+#define __ARM_COMPUTE_CLREDUCE_MAX_H__
+
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a reduce-max operation. This function calls the following OpenCL
+ * kernel:
+ *
+ * -# @ref CLReduceMaxKernel
+ */
+class CLReduceMax : public IFunction
+{
+public:
+ /** Constructor */
+ CLReduceMax();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReduceMax(const CLReduceMax &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReduceMax &operator=(const CLReduceMax &) = delete;
+ /** Allow instances of this class to be moved */
+ CLReduceMax(CLReduceMax &&) = default;
+ /** Allow instances of this class to be moved */
+ CLReduceMax &operator=(CLReduceMax &&) = default;
+ /** Initialise the function's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: F32
+ * @param[in] axis Axis to reduce. Data type supported: S32
+ * @param[out] output Output tensor holding the maximum values along @p axis. Data types
+ * supported: F32
+ */
+ void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
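+ /* Illustrative call sequence (tensor names are placeholders):
+ *
+ * @code
+ * CLReduceMax reduce_max;
+ * reduce_max.configure(&input, 0, &output); // reduce along axis 0
+ * reduce_max.run();
+ * @endcode
+ */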
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceMax
+ *
+ * @param[in] input Input tensor info. Data types supported: F32
+ * @param[in] axis Axis to reduce. Data type supported: S32
+ * @param[in] output Output tensor info. Data types supported: F32
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void run_on_cpu();
+
+ int32_t _axis;
+
+ ICLTensor *_input;
+ ICLTensor *_output;
+
+ std::unique_ptr<ICLKernel> _kernel;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
new file mode 100644
index 000000000..2081518c1
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
+#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform a mean reduction. This function calls the following kernels:
+ *
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLReductionMeanKernel
+ */
+class CLReductionMean : public IFunction
+{
+public:
+ /** Default Constructor.
+ */
+ CLReductionMean();
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * @param[in] axis Axes along which to reduce. Supported reduction axes: 0, 1
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
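+ /* Sketch of a typical call, assuming NCHW F32 tensors allocated by the caller:
+ *
+ * @code
+ * CLReductionMean mean;
+ * mean.configure(&input, &output, std::vector<uint32_t>{0, 1});
+ * mean.run();
+ * @endcode
+ */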
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLReductionMean.
+ *
+ * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axes along which to reduce. Supported reduction axes: 0, 1
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLReductionMeanKernel _reduction_mean_kernel;
+ CLFillBorderKernel _fill_border_kernel;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
new file mode 100644
index 000000000..f223a79be
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLStridedSliceKernel */
+class CLStridedSlice : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input First tensor input. Data type supported:
+ * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData Tensor holding the begin index for each dimension. Data type supported: S32
+ * @param[in] endData Tensor holding the end index for each dimension. Data type supported: S32
+ * @param[in] stridesData Tensor holding the stride for each dimension. Data type supported: S32
+ * @param[in] beginMask Bit mask of dimensions whose begin index is ignored
+ * @param[in] endMask Bit mask of dimensions whose end index is ignored
+ * @param[in] shrinkAxisMask Bit mask of dimensions to be shrunk to size one in the output
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
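+ /* Illustrative use, with begin/end/strides prepared as S32 tensors by the caller
+ * and all masks disabled:
+ *
+ * @code
+ * CLStridedSlice slice;
+ * slice.configure(&input, &output, &begin, &end, &strides, 0, 0, 0);
+ * slice.run();
+ * @endcode
+ */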
+};
+
+/** Variant of @ref CLStridedSlice that performs the slicing on the CPU */
+class CLStridedSliceCPU : public IFunction
+{
+public:
+ /** Initialise inputs and outputs. The parameters mirror @ref CLStridedSlice::configure.
+ *
+ * @param[in] input First tensor input.
+ * @param[out] output Output tensor.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+
+ void run() override;
+
+private:
+ void run_on_cpu();
+
+ ICLTensor *_input;
+ ICLTensor *_output;
+ ICLTensor *_beginData;
+ ICLTensor *_endData;
+ ICLTensor *_stridesData;
+ int32_t _beginMask;
+ int32_t _endMask;
+ int32_t _shrinkAxisMask;
+};
+}
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
new file mode 100644
index 000000000..06cd1ee9b
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
+#define __ARM_COMPUTE_CLTOPK_V2_H__
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLTopKV2Kernel
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /** Constructor */
+ CLTopKV2();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2(const CLTopKV2 &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTopKV2(CLTopKV2 &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+ /** Initialise the function's inputs and outputs.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S16/F32.
+ * @param[in] k The number of largest elements to extract.
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @param[in] total_bits Total number of bits of the radix-sort key (default: 32).
+ * @param[in] bits Number of bits handled per radix-sort pass (default: 4).
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits = 32, int bits = 4);
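+ /* Illustrative call extracting the five largest elements; the radix-sort
+ * defaults are kept and the intermediate buffers are managed internally:
+ *
+ * @code
+ * CLTopKV2 topk;
+ * topk.configure(&input, 5, &values, &indices);
+ * topk.run();
+ * @endcode
+ */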
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+};
+}
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/libs/ARMComputeEx/resolve_includes.py b/libs/ARMComputeEx/resolve_includes.py
new file mode 100644
index 000000000..b3e252892
--- /dev/null
+++ b/libs/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+import os.path
+import re
+import subprocess
+import glob
+
+
+def resolve_includes(target, source):
+    # File collection
+    FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
+
+    # Include pattern
+    pattern = re.compile("#include \"(.*)\"")
+
+    # Get file contents
+    files = []
+    for src, dst in zip(source, target):
+        with open(src) as f:
+            contents = f.read().splitlines()
+        entry = FileEntry(target_name=dst, file_contents=contents)
+        files.append((os.path.basename(src), entry))
+
+    # Create dictionary of tupled list
+    files_dict = dict(files)
+
+    # Check for includes (can only be files in the same folder)
+    final_files = []
+    for file in files:
+        done = False
+        tmp_file = file[1].file_contents
+        print(file[1].target_name)
+        while not done:
+            file_count = 0
+            updated_file = []
+            for line in tmp_file:
+                found = pattern.search(line)
+                if found:
+                    include_file = found.group(1)
+                    data = files_dict[include_file].file_contents
+                    updated_file.extend(data)
+                else:
+                    updated_file.append(line)
+                    file_count += 1
+
+            # Check if all includes have been replaced.
+            if file_count == len(tmp_file):
+                done = True
+
+            # Update temp file
+            tmp_file = updated_file
+
+        # Append and prepend string literal identifiers and add expanded file to final list
+        tmp_file.insert(0, "R\"(\n")
+        tmp_file.append("\n)\"")
+        entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
+        final_files.append((file[0], entry))
+
+    # Write output files
+    for file in final_files:
+        with open(file[1].target_name, 'w+') as out_file:
+            out_file.write("\n".join(file[1].file_contents))
+
+
+# Generate embed files
+cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
+cl_files += glob.glob('src/core/CL/cl_kernels/*.h')
+
+# DEBUG: print cl files
+print("cl_files:")
+print(cl_files)
+
+embed_files = [f + "embed" for f in cl_files]
+print("embed_files:")
+print(embed_files)
+
+resolve_includes(embed_files, cl_files)
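+
+# Note: the glob patterns above are relative, so the script is expected to be run
+# from the library root, e.g. (illustrative):
+#   cd libs/ARMComputeEx && python resolve_includes.py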
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 000000000..d535c5da4
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ {"absdiff", "absdiff.cl"},
+ {"accumulate", "accumulate.cl"},
+ {"accumulate_squared", "accumulate.cl"},
+ {"accumulate_weighted", "accumulate.cl"},
+ {"activation_layer", "activation_layer.cl"},
+ {"activation_layer_qa8", "activation_layer_qa8.cl"},
+ {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"},
+ {"arithmetic_add", "arithmetic_op.cl"},
+ {"arithmetic_sub", "arithmetic_op.cl"},
+ {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
+ {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"},
+ {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"},
+ {"bitwise_or", "bitwise_op.cl"},
+ {"bitwise_and", "bitwise_op.cl"},
+ {"bitwise_xor", "bitwise_op.cl"},
+ {"bitwise_not", "bitwise_op.cl"},
+ {"cast", "cast.cl"},
+ {"cast_qasymm_in", "cast.cl"},
+ {"cast_qasymm_out", "cast.cl"},
+ {"channel_combine_NV", "channel_combine.cl"},
+ {"channel_combine_RGB888", "channel_combine.cl"},
+ {"channel_combine_RGBA8888", "channel_combine.cl"},
+ {"channel_combine_UYVY422", "channel_combine.cl"},
+ {"channel_combine_YUYV422", "channel_combine.cl"},
+ {"channel_shuffle_nchw", "channel_shuffle.cl"},
+ {"channel_extract_NV12", "channel_extract.cl"},
+ {"channel_extract_NV21", "channel_extract.cl"},
+ {"channel_extract_RGB888", "channel_extract.cl"},
+ {"channel_extract_RGBA8888", "channel_extract.cl"},
+ {"channel_extract_UYVY422", "channel_extract.cl"},
+ {"channel_extract_YUYV422", "channel_extract.cl"},
+ {"combine_gradients_L1", "canny.cl"},
+ {"combine_gradients_L2", "canny.cl"},
+ {"concatenate_depth", "concatenate.cl"},
+ {"concatenate_width", "concatenate.cl"},
+ {"convolution_rectangle", "convolution_rectangle.cl"},
+ {"col2im", "col2im.cl"},
+ {"convert_depth_down", "depth_convert.cl"},
+ {"convert_depth_up", "depth_convert.cl"},
+ {"convert_fc_weights", "convert_fc_weights.cl"},
+ {"convolution3x3_static", "convolution3x3.cl"},
+ {"convolution5x5_static", "convolution5x5.cl"},
+ {"convolution7x7_static", "convolution7x7.cl"},
+ {"convolution9x9_static", "convolution9x9.cl"},
+ {"convolution_separable1x5_static", "convolution5x5.cl"},
+ {"convolution_separable5x1_static", "convolution5x5.cl"},
+ {"convolution_separable1x7_static", "convolution7x7.cl"},
+ {"convolution_separable7x1_static", "convolution7x7.cl"},
+ {"convolution_separable1x9_static", "convolution9x9.cl"},
+ {"convolution_separable9x1_static", "convolution9x9.cl"},
+ {"copy_tensor", "copy_tensor.cl"},
+ {"copy_plane", "channel_extract.cl"},
+ {"copy_planes_3p", "channel_combine.cl"},
+ {"copy_to_keypoint", "fast_corners.cl"},
+ {"deconvolution_upsample", "deconvolution_layer.cl"},
+ {"depthwise_convolution_3x3", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_im2col", "depthwise_convolution.cl"},
+ {"depthwise_vector_to_tensor", "depthwise_convolution.cl"},
+ {"depthwise_weights_reshape", "depthwise_convolution.cl"},
+ {"dequantization_layer", "dequantization_layer.cl"},
+ {"derivative", "derivative.cl"},
+ {"dilate", "dilate.cl"},
+ {"direct_convolution1x1", "direct_convolution1x1.cl"},
+ {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"},
+ {"direct_convolution3x3", "direct_convolution3x3.cl"},
+ {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"},
+ {"direct_convolution5x5", "direct_convolution5x5.cl"},
+ {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
+ {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"erode", "erode.cl"},
+ {"fast_corners", "fast_corners.cl"},
+ {"fill_image_borders_constant", "fill_border.cl"},
+ {"fill_image_borders_replicate", "fill_border.cl"},
+ {"finalize", "optical_flow_pyramid_lk.cl"},
+ {"floor_layer", "floor.cl"},
+ {"gather", "gather.cl"},
+ {"gather_1d", "gather.cl"},
+ {"gather_1d_out", "gather.cl"},
+ {"gaussian1x5_sub_x", "gaussian_pyramid.cl"},
+ {"gaussian5x1_sub_y", "gaussian_pyramid.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
+ {"gemm_interleave4x4", "gemm.cl"},
+ {"gemm_ma_f16", "gemm.cl"},
+ {"gemm_ma_f32", "gemm.cl"},
+ {"gemm_ma_qs8", "gemm.cl"},
+ {"gemm_ma_qs16", "gemm.cl"},
+ {"gemm_mv", "gemv.cl"},
+ {"gemm_mv_quantized", "gemv.cl"},
+ {"gemm_mm_interleaved_transposed_f16", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"},
+ {"gemm_mm_floating_point", "gemm.cl"},
+ {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"},
+ {"gemm_mm_qs8", "gemm.cl"},
+ {"gemm_mm_qs16", "gemm.cl"},
+ {"gemm_lc_vm_f32", "gemm.cl"},
+ {"gemm_transpose1xW", "gemm.cl"},
+ {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"},
+ {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"},
+ {"gemmlowp_mm_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_midgard", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"},
+ {"gemmlowp_offset_contribution", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"},
+ {"harris_score_3x3", "harris_corners.cl"},
+ {"harris_score_5x5", "harris_corners.cl"},
+ {"harris_score_7x7", "harris_corners.cl"},
+ {"hist_border_kernel", "histogram.cl"},
+ {"hist_border_kernel_fixed", "histogram.cl"},
+ {"hist_local_kernel", "histogram.cl"},
+ {"hist_local_kernel_fixed", "histogram.cl"},
+ {"hog_block_normalization", "hog.cl"},
+ {"hog_detector", "hog.cl"},
+ {"hog_orientation_binning", "hog.cl"},
+ {"hysteresis", "canny.cl"},
+ {"im2col1x1_stridex1_dchw", "im2col.cl"},
+ {"im2col3x3_dchw", "im2col.cl"},
+ {"im2col5x5_dchw", "im2col.cl"},
+ {"im2col11x11_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_generic_dchw", "im2col.cl"},
+ {"im2col_generic_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_reduced_dchw", "im2col.cl"},
+ {"init_level", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"},
+ {"integral_horizontal", "integral_image.cl"},
+ {"integral_vertical", "integral_image.cl"},
+ {"IYUV_to_NV12_bt709", "color_convert.cl"},
+ {"IYUV_to_RGB888_bt709", "color_convert.cl"},
+ {"IYUV_to_RGBA8888_bt709", "color_convert.cl"},
+ {"IYUV_to_YUV444_bt709", "color_convert.cl"},
+ {"l2_normalize", "l2_normalize.cl"},
+ {"lktracker_stage0", "optical_flow_pyramid_lk.cl"},
+ {"lktracker_stage1", "optical_flow_pyramid_lk.cl"},
+ {"magnitude_phase", "magnitude_phase.cl"},
+ {"mean_stddev_accumulate", "mean_stddev.cl"},
+ {"minmax", "minmaxloc.cl"},
+ {"minmax_border", "minmaxloc.cl"},
+ {"minmax_layer", "minmax_layer.cl"},
+ {"minmaxloc", "minmaxloc.cl"},
+ {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"},
+ {"non_max_suppression", "nonmax.cl"},
+ {"normalization_layer_cross_map", "normalization_layer.cl"},
+ {"normalization_layer_in_map", "normalization_layer.cl"},
+ {"NV12_to_IYUV_bt709", "color_convert.cl"},
+ {"NV12_to_RGB888_bt709", "color_convert.cl"},
+ {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV12_to_YUV444_bt709", "color_convert.cl"},
+ {"NV21_to_IYUV_bt709", "color_convert.cl"},
+ {"NV21_to_RGB888_bt709", "color_convert.cl"},
+ {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV21_to_YUV444_bt709", "color_convert.cl"},
+ {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"permute_201", "permute.cl"},
+ {"permute_120", "permute.cl"},
+ {"permute_3201", "permute.cl"},
+ {"pixelwise_mul_float", "pixelwise_mul_float.cl"},
+ {"pixelwise_mul_int", "pixelwise_mul_int.cl"},
+ {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
+ {"pixelwise_div_float", "pixelwise_div_float.cl"},
+ {"pixelwise_div_int", "pixelwise_div_int.cl"},
+ {"pooling_layer_2", "pooling_layer.cl"},
+ {"pooling_layer_3", "pooling_layer.cl"},
+ {"pooling_layer_optimized_3", "pooling_layer.cl"},
+ {"pooling_layer_7", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nchw", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nhwc", "pooling_layer.cl"},
+ {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
+ {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
+ {"quantization_layer", "quantization_layer.cl"},
+ {"reduce_max", "reduce_max.cl"},
+ {"reduction_operation", "reduction_operation.cl"},
+ {"reduction_mean", "reduction_mean.cl"},
+ {"remap_nearest_neighbour", "remap.cl"},
+ {"remap_bilinear", "remap.cl"},
+ {"reshape_layer", "reshape_layer.cl"},
+ {"reshape_to_columns", "convolution_layer.cl"},
+ {"RGB888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGB888_to_NV12_bt709", "color_convert.cl"},
+ {"RGB888_to_RGBA8888_bt709", "color_convert.cl"},
+ {"RGB888_to_YUV444_bt709", "color_convert.cl"},
+ {"RGBA8888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGBA8888_to_NV12_bt709", "color_convert.cl"},
+ {"RGBA8888_to_RGB888_bt709", "color_convert.cl"},
+ {"RGBA8888_to_YUV444_bt709", "color_convert.cl"},
+ {"roi_pooling_layer", "roi_pooling_layer.cl"},
+ {"scale_nearest_neighbour", "scale.cl"},
+ {"scale_bilinear", "scale.cl"},
+ {"scharr3x3", "scharr_filter.cl"},
+ {"sobel3x3", "sobel_filter.cl"},
+ {"sobel_separable5x1", "sobel_filter.cl"},
+ {"sobel_separable1x5", "sobel_filter.cl"},
+ {"sobel_separable7x1", "sobel_filter.cl"},
+ {"sobel_separable1x7", "sobel_filter.cl"},
+ {"softmax_layer_norm", "softmax_layer.cl"},
+ {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"},
+ {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"},
+ {"strided_slice", "strided_slice.cl"},
+ {"suppress_non_maximum", "canny.cl"},
+ {"tablelookup_U8", "tablelookup.cl"},
+ {"tablelookup_S16", "tablelookup.cl"},
+ {"threshold_binary", "threshold.cl"},
+ {"threshold_range", "threshold.cl"},
+ {"transpose", "transpose.cl"},
+ {"UYVY422_to_IYUV_bt709", "color_convert.cl"},
+ {"UYVY422_to_NV12_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGB888_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"warp_affine_nearest_neighbour", "warp_affine.cl"},
+ {"warp_affine_bilinear", "warp_affine.cl"},
+ {"warp_perspective_nearest_neighbour", "warp_perspective.cl"},
+ {"warp_perspective_bilinear", "warp_perspective.cl"},
+ {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"YUYV422_to_IYUV_bt709", "color_convert.cl"},
+ {"YUYV422_to_NV12_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGB888_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"radixsort_histogram", "topkv2_radixsort.cl"},
+ {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+ {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+};
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "fixed_point.h",
+#include "./cl_kernels/fixed_point.hembed"
+ },
+ {
+ "gather.cl",
+#include "./cl_kernels/gather.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+ },
+ {
+ "pixelwise_div_float.cl",
+#include "./cl_kernels/pixelwise_div_float.clembed"
+ },
+ {
+ "pixelwise_div_int.cl",
+#include "./cl_kernels/pixelwise_div_int.clembed"
+ },
+ {
+ "reduce_max.cl",
+#include "./cl_kernels/reduce_max.clembed"
+ },
+ {
+ "reduction_mean.cl",
+#include "./cl_kernels/reduction_mean.clembed"
+ },
+ {
+ "strided_slice.cl",
+#include "./cl_kernels/strided_slice.clembed"
+ },
+ {
+ "topkv2.cl",
+#include "./cl_kernels/topkv2.clembed"
+ },
+ {
+ "topkv2_radixsort.cl",
+#include "./cl_kernels/topkv2_radixsort.clembed"
+ },
+ {
+ "topkv2_quicksort.cl",
+#include "./cl_kernels/topkv2_quicksort.clembed"
+ },
+#endif /* EMBEDDED_KERNELS */
+};
+
+CLKernelLibraryEx::CLKernelLibraryEx()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
+ // CLKernelLibrary is built
+}
+
+CLKernelLibraryEx &CLKernelLibraryEx::get()
+{
+ static CLKernelLibraryEx _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if (_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+ std::string concat_str;
+
+ if (fp16_supported(_device))
+ {
+ concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+ }
+
+ if (get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else if (arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if (_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
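+
+// Illustrative use of create_kernel() above; the kernel name and build options are
+// taken from the tables at the top of this file, and the context/device are assumed
+// to have been initialised beforehand:
+//   Kernel k = CLKernelLibraryEx::get().create_kernel(
+//       "cast", {"-DDATA_TYPE_IN=float", "-DDATA_TYPE_OUT=half", "-DVEC_SIZE=16"});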
+
+void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
+ cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
+const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if (program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else /* EMBEDDED_KERNELS */
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if (std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name,
+ std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if (std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif /* EMBEDDED_KERNELS */
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
+{
+ std::string concat_set;
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for (const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
+
+std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
+{
+ size_t result;
+
+ cl_int err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
+}
+
+cl::NDRange CLKernelLibraryEx::default_ndrange() const
+{
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget _target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch (_target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
new file mode 100644
index 000000000..0c0a9ede6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** Performs a pixelwise addition of two QASYMM8 inputs, requantizing the sum to QASYMM8
+ *
+ * The following computations will be performed:
+ *
+ * -# Add the offset term to each input and scale it
+ * -# Add the scaled inputs
+ * -# Multiply the sum by result_mult_int and shift it right by result_shift
+ * -# Add the output offset term
+ * -# Cast the resulting int32 values to QASYMM8
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left the input tensors must be passed at compile time using -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of the input tensors must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_add_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Get scaled value of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+
+ VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
+ VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
+ VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
+
+ VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
+ VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
+
+ // Add inputs and multiply with a multiplier smaller than 1
+ VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
+ VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+// TODO: Apply min-max BOUND to support fusion with ReLU.
+/*
+#if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+*/
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
+ 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
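+
+// Example build options for this kernel (illustrative; the actual values are derived
+// from the quantization parameters of the tensors at configure time):
+//   -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar -DVEC_SIZE=16
+//   -DLEFT_SHIFT=... -DIN1_OFFSET=... -DIN1_MULT_INT=... -DIN1_SHIFT=...
+//   -DIN2_OFFSET=... -DIN2_MULT_INT=... -DIN2_SHIFT=...
+//   -DRESULT_OFFSET=... -DRESULT_MULT_INT=... -DRESULT_SHIFT=...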
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..113804cca
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef SCALE_IN
+#define SCALE_IN 1.0f
+#endif
+#ifndef OFFSET_IN
+#define OFFSET_IN 0
+#endif
+
+/** Perform a cast operation on an input tensor.
+ *
+ * @attention Input and output data types can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=half
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data type: specified with -DDATA_TYPE_OUT (may differ from @p input_ptr)
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
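+
+// For instance (illustrative), the cast kernel above would be built with
+// -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=half -DVEC_SIZE=16 to cast F32 to F16.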
+
+
+/** Perform a dequantizing cast operation on a QASYMM8 input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The quantization offset and scale can be passed using -DOFFSET_IN and -DSCALE_IN, e.g. -DOFFSET_IN=128 -DSCALE_IN=0.5f
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_qasymm_in(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+
+ VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
+
+ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+
+/** Perform a quantizing cast operation producing a QASYMM8 output tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The quantization offset and scale can be passed using -DOFFSET_IN and -DSCALE_IN, e.g. -DOFFSET_IN=128 -DSCALE_IN=0.5f
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_qasymm_out(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+
+ VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+ VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+
+ VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
+ 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
new file mode 100644
index 000000000..7807533e2
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_FIXED_POINT_H
+#define ARM_COMPUTE_FIXED_POINT_H
+
+#define TYPE_ALIAS(type, alias) \
+ typedef type alias; \
+ typedef type alias##x##1; \
+ typedef type##2 alias##x##2; \
+ typedef type##3 alias##x##3; \
+ typedef type##4 alias##x##4; \
+ typedef type##8 alias##x##8; \
+ typedef type##16 alias##x##16;
+
+TYPE_ALIAS(char, qs8)
+TYPE_ALIAS(short, qs16)
+TYPE_ALIAS(int, qs32)
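+
+/* For example, TYPE_ALIAS(char, qs8) above makes qs8x16 an alias of the built-in
+ * OpenCL vector type char16, so QS8 kernels can be written in terms of qs8xN types. */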
+
+#define qs8_MIN ((char)CHAR_MIN)
+#define qs8_MAX ((char)CHAR_MAX)
+#define qs16_MIN ((short)SHRT_MIN)
+#define qs16_MAX ((short)SHRT_MAX)
+#define qs32_MIN ((int)INT_MIN)
+#define qs32_MAX ((int)INT_MAX)
+
+#define qu8_MIN ((uchar)0)
+#define qu8_MAX ((uchar)UCHAR_MAX)
+#define qu16_MIN ((ushort)0)
+#define qu16_MAX ((ushort)USHRT_MAX)
+#define qu32_MIN ((uint)0)
+#define qu32_MAX ((uint)UINT_MAX)
+
+#define qs8_TYPE char
+#define qs8x1_TYPE char
+#define qs8x2_TYPE char2
+#define qs8x3_TYPE char3
+#define qs8x4_TYPE char4
+#define qs8x8_TYPE char8
+#define qs8x16_TYPE char16
+
+#define qs16_TYPE short
+#define qs16x1_TYPE short
+#define qs16x2_TYPE short2
+#define qs16x3_TYPE short3
+#define qs16x4_TYPE short4
+#define qs16x8_TYPE short8
+#define qs16x16_TYPE short16
+
+#define qs32_TYPE int
+#define qs32x1_TYPE int
+#define qs32x2_TYPE int2
+#define qs32x3_TYPE int3
+#define qs32x4_TYPE int4
+#define qs32x8_TYPE int8
+#define qs32x16_TYPE int16
+
+/* All internal constants are represented in the maximum supported fixed point format (QS16),
+ * thus we define an additional shift parameter required to convert the constant
+ * from the maximum supported format to the required one.
+ */
+#define qs8_SHIFT 8
+#define qs16_SHIFT 0
+
+#undef VEC_DATA_TYPE_STR
+#undef VEC_DATA_TYPE
+#undef CONVERT_STR
+#undef CONVERT
+#undef CONVERT_SAT_STR
+#undef CONVERT_SAT
+
+#define VEC_DATA_TYPE_STR(type, size) type##x##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
+#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
+#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
+#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
+#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+/** Computes saturating absolute value of fixed point vector.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point absolute value.
+ */
+#define ABSQ_SAT_IMPL(type) \
+ inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); }
+
+ABSQ_SAT_IMPL(qs8x16)
+ABSQ_SAT_IMPL(qs16x8)
+
+#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
+#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
+
+/** Computes max of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point maximum.
+ */
+#define MAXQ_IMPL(type) \
+ inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); }
+
+MAXQ_IMPL(qs8x1)
+MAXQ_IMPL(qs8x2)
+MAXQ_IMPL(qs8x4)
+MAXQ_IMPL(qs8x8)
+MAXQ_IMPL(qs8x16)
+MAXQ_IMPL(qs16x1)
+MAXQ_IMPL(qs16x2)
+MAXQ_IMPL(qs16x4)
+MAXQ_IMPL(qs16x8)
+MAXQ_IMPL(qs16x16)
+
+#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
+#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated addition of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point addition. The result is saturated in case of overflow
+ */
+#define ADDQ_SAT_IMPL(type) \
+ inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); }
+
+ADDQ_SAT_IMPL(qs8x1)
+ADDQ_SAT_IMPL(qs8x2)
+ADDQ_SAT_IMPL(qs8x4)
+ADDQ_SAT_IMPL(qs8x8)
+ADDQ_SAT_IMPL(qs8x16)
+ADDQ_SAT_IMPL(qs16x1)
+ADDQ_SAT_IMPL(qs16x2)
+ADDQ_SAT_IMPL(qs16x4)
+ADDQ_SAT_IMPL(qs16x8)
+ADDQ_SAT_IMPL(qs16x16)
+ADDQ_SAT_IMPL(qs32x1)
+ADDQ_SAT_IMPL(qs32x2)
+ADDQ_SAT_IMPL(qs32x4)
+ADDQ_SAT_IMPL(qs32x8)
+ADDQ_SAT_IMPL(qs32x16)
+
+#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
+#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated subtraction of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point subtraction. The result is saturated in case of overflow
+ */
+#define SUBQ_SAT_IMPL(type) \
+ inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); }
+
+SUBQ_SAT_IMPL(qs8x1)
+SUBQ_SAT_IMPL(qs8x2)
+SUBQ_SAT_IMPL(qs8x4)
+SUBQ_SAT_IMPL(qs8x8)
+SUBQ_SAT_IMPL(qs8x16)
+SUBQ_SAT_IMPL(qs16x1)
+SUBQ_SAT_IMPL(qs16x2)
+SUBQ_SAT_IMPL(qs16x4)
+SUBQ_SAT_IMPL(qs16x8)
+SUBQ_SAT_IMPL(qs16x16)
+
+#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
+#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Multiplies two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication.
+ */
+#define MULQ_IMPL(type, itype) \
+ inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
+ return CONVERT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_IMPL(qs8x8, qs16x8)
+MULQ_IMPL(qs16x8, qs32x8)
+MULQ_IMPL(qs8x16, qs16x16)
+MULQ_IMPL(qs16x16, qs32x16)
+
+#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
+#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
+
+/** Computes saturated multiplication of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication. The result is saturated in case of overflow
+ */
+#define MULQ_SAT_IMPL(type, itype) \
+ inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
+ return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
+MULQ_SAT_IMPL(qs8x8, qs16x8)
+MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
+MULQ_SAT_IMPL(qs16x16, qs32x16)
+
+#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \
+ mul_sat_##type##x##size((a), (b), (position))
+#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \
+ MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
+
+/** Saturate multiply-accumulate
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate. The result is saturated in case of
+ * overflow
+ */
+#define MLAQ_SAT_IMPL(type, itype) \
+ type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
+ (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
+ }
+
+MLAQ_SAT_IMPL(qs8x8, qs16x8)
+MLAQ_SAT_IMPL(qs8x16, qs16x16)
+MLAQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
+ mla_sat_##type##x##size((a), (b), (c), (position))
+#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \
+ MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate multiply-accumulate long
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate long. The result is saturated in case
+ * of overflow
+ */
+#define MLALQ_SAT_IMPL(type, itype) \
+ itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
+ (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, res >> (itype)fixed_point_position); \
+ }
+
+MLALQ_SAT_IMPL(qs8x8, qs16x8)
+MLALQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
+ mlal_sat_##type##x##size((a), (b), (c), (position))
+#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \
+ MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate division of two fixed point vectors
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point division. The result is saturated in case of overflow
+ */
+#define DIVQ_SAT_IMPL(stype, type, itype) \
+ inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype conv_a = CONVERT((VopA), itype); \
+ itype denominator = CONVERT((VopB), itype); \
+ itype numerator = conv_a << (itype)(fixed_point_position); \
+ itype res = select((itype)(numerator / denominator), \
+ select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \
+ (itype)(denominator == (itype)0)); \
+ return CONVERT_SAT((res), type); \
+ }
+
+DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
+DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
+DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
+DIVQ_SAT_IMPL(qs8, qs8, qs16)
+DIVQ_SAT_IMPL(qs16, qs16, qs32)
+
+#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
+#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
+
+#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \
+ div_sat_##type##x##size((a), (b), (position))
+#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \
+ DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
+
+/** Saturate exponential of a fixed point vector
+ *
+ * @note Implemented approach uses a Taylor polynomial to approximate the exponential function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point exponential. The result is saturated in case of overflow
+ */
+#define EXPQ_IMPL(stype, type, size) \
+ inline type exp_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
+ type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
+ type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
+ type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
+ type dec_m = m >> (type)fixed_point_position; \
+ type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \
+ fixed_point_position); \
+ alpha = CONVERT(abs_diff(VopA, alpha), type); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
+ return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \
+ clz(sum) > dec_m); /* Saturate result if needed */ \
+ }
+
+EXPQ_IMPL(qs8, qs8x2, 2)
+EXPQ_IMPL(qs8, qs8x4, 4)
+EXPQ_IMPL(qs8, qs8x8, 8)
+EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x2, 2)
+EXPQ_IMPL(qs16, qs16x4, 4)
+EXPQ_IMPL(qs16, qs16x8, 8)
+EXPQ_IMPL(qs16, qs16x16, 16)
+
+#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
+#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
+
+/** Saturate logarithm of a fixed point vector
+ *
+ * @note Implemented approach uses a Taylor polynomial to approximate the logarithm function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point logarithm. The result is saturated in case of overflow
+ */
+#define LOGQ_IMPL(stype, type, size) \
+ inline type log_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+    type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 0.6931472 */                \
+ type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
+ type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
+ type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
+ type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
+ type inter_a = \
+ select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \
+ VopA < const_one); \
+ type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
+ inter_a = inter_a >> shift_val; \
+ inter_a = sub_sat(inter_a, const_one); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
+ sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
+ sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \
+ size, fixed_point_position); \
+ return select(select(sum, -sum, VopA < const_one), (type)0, \
+ VopA < (type)0); /* Saturate result if needed */ \
+ }
+
+LOGQ_IMPL(qs8, qs8x16, 16)
+LOGQ_IMPL(qs16, qs16x8, 8)
+LOGQ_IMPL(qs16, qs16x16, 16)
+
+#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
+#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
+
+/** Saturate inverse square root of a fixed point vector
+ *
+ * @note Implemented approach uses Newton's method to approximate the inverse square root function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point inverse square root. The result is saturated in case of
+ * overflow
+ */
+#define INVSQRTQ_IMPL(stype, type, size) \
+ inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_three = (type)(3 << (fixed_point_position)); \
+ type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
+ type temp = select((type)(VopA >> shift_value), \
+ select((type)stype##_MAX, (type)(VopA << (-shift_value)), \
+ (type)(clz(VopA) > (-shift_value))), \
+ (type)(shift_value < (type)0)); \
+ type x = temp; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
+ { \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ x = MUL_SAT_OP_EXPAND( \
+ x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
+ fixed_point_position), \
+ temp, stype, size, fixed_point_position)), \
+ stype, size, fixed_point_position) >> \
+ 1; \
+ } \
+ type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
+ return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \
+ (type)(clz(x) > shift_value2)), \
+ (type)(shift_value < (type)0)); /* Saturate result if needed */ \
+ }
+
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
+INVSQRTQ_IMPL(qs8, qs8x16, 16)
+INVSQRTQ_IMPL(qs16, qs16x8, 8)
+
+#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
+#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
+
+/** Saturate hyperbolic tangent of a fixed point vector
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of the calculated elements.
+ *
+ * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of
+ * overflow
+ */
+#define TANHQ_IMPL(stype, type, size) \
+ inline type tanh_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type const_two = (type)(2 << (fixed_point_position)); \
+ type exp2x = \
+ EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \
+ stype, size, fixed_point_position); \
+ type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
+ }
+
+TANHQ_IMPL(qs8, qs8x16, 16)
+TANHQ_IMPL(qs16, qs16x8, 8)
+
+#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
+#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
+
+#define floatx16 float16
+#define float16_TYPE float16
+
+#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a * (1 << fixed_point_position) + \
+ select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
+ out_type); \
+ }
+
+CONVERTQ_DOWN_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_IMPL(float16, qs16x16)
+
+#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT(a * (1 << fixed_point_position) + \
+ select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
+ out_type); \
+ }
+
+CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
+
+#define CONVERTQ_UP_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a, out_type) / (1 << fixed_point_position); \
+ }
+
+CONVERTQ_UP_IMPL(qs8x16, float16)
+CONVERTQ_UP_IMPL(qs16x16, float16)
+
+#define SQCVT_SAT_IMPL(type) \
+ inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
+ }
+
+SQCVT_SAT_IMPL(qs8)
+SQCVT_SAT_IMPL(qs16)
+
+#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
+#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
+
+#endif // ARM_COMPUTE_FIXED_POINT_H
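
As a concrete illustration of the rounding performed by MULQ_IMPL, the sketch
below (a hypothetical kernel, not part of the patch; enqueue with a global size
of 1) multiplies two QS8 values with fixed_point_position = 4, where a raw
value r represents r / 2^4:

    #include "fixed_point.h"

    __kernel void fixed_point_mul_demo(__global char *out)
    {
        qs8x8 a = (qs8x8)(0x20); // 32 / 16 = 2.0
        qs8x8 b = (qs8x8)(0x18); // 24 / 16 = 1.5
        // mul_qs8x8 widens to qs16x8, adds the rounding constant 1 << 3,
        // then shifts right by 4: (32 * 24 + 8) >> 4 = 48, i.e. 48 / 16 = 3.0
        qs8x8 c = MUL_OP_EXPAND(a, b, qs8, 8, 4);
        out[0] = c.s0; // 48
    }
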
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
new file mode 100644
index 000000000..25e20f5f2
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Performs a gather operation
+ *
+ * @note Data types should be given as preprocessor arguments using -DDATA_TYPE_IN1=type, -DDATA_TYPE_IN2=type and -DDATA_TYPE_OUT=type. e.g. -DDATA_TYPE_IN1=short
+ *
+ * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32
+ * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in]  input2_ptr                            Pointer to the second source tensor. Supported data types: U32
+ * @param[in]  input2_stride_x                       Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the second source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input1_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gather(IMAGE_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ IMAGE_DECLARATION(output))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcpy
+  int index = in2_data.s0;
+  int stride = input1_stride_y / input1_stride_x;
+
+  for (int i = 0; i < stride; i++)
+  {
+    *((__global DATA_TYPE_OUT *)offset(&out, i, get_global_id(0))) =
+        *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
+  }
+}
+
+__kernel void gather_1d_out(IMAGE_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ VECTOR_DECLARATION(output))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcpy
+  int index = in2_data.s0;
+  int stride = input1_stride_y / input1_stride_x;
+
+  for (int i = 0; i < stride; i++)
+  {
+    *((__global DATA_TYPE_OUT *)vector_offset(&out, i + get_global_id(0))) =
+        *((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
+  }
+}
+
+__kernel void gather_1d(VECTOR_DECLARATION(input1),
+ VECTOR_DECLARATION(input2),
+ VECTOR_DECLARATION(output))
+{
+ Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1);
+ Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2);
+ Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
+ in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
+
+  // TODO: performance tuning for memcpy
+  int index = in2_data.s0;
+  *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0))) =
+      *((__global DATA_TYPE_IN1 *)vector_offset(&in1, index));
+}
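
The element-wise effect of gather_1d is output[g] = input1[input2[g]] for each
work-item g. A minimal stand-alone reference kernel with the types fixed
(hypothetical, for comparison only):

    __kernel void gather_1d_reference(__global const float *src,
                                      __global const uint *indices,
                                      __global float *dst)
    {
        const int g = get_global_id(0);
        // e.g. src = {10, 20, 30}, indices = {2, 0} -> dst = {30, 10}
        dst[g] = src[indices[g]];
    }
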
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..8143d2398
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+
+#define EXPAND(x) x
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+ uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, name##_step_x, name##_stride_y, \
+ name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+ name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+ mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+  __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x)
+{
+ Vector vector = {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr +=
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y,
+ uint step_y, uint stride_z, uint step_z)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z)
+{
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
+Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+ (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+__global inline const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+__global inline uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
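
A minimal sketch (hypothetical kernel, not part of the patch) of how the
declaration macros, the CONVERT_TO_*_STRUCT helpers and the offset accessors
compose; it copies one uchar per work-item:

    #include "helpers.h"

    __kernel void image_copy_demo(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
    {
        // The step arguments are folded into the pointers, so both already
        // point at this work-item's element after the conversion.
        Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
        Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);
        *dst_img.ptr = *src_img.ptr;
    }
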
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ mask = (1 << exponent) - 1; \
+ const VEC_DATA_TYPE(int, size) zero = 0; \
+ const VEC_DATA_TYPE(int, size) one = 1; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+ }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ * corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+ VEC_DATA_TYPE(int, size) then_val, \
+ VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a == 0); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a != 0); \
+ }
+
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK( \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+ size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+ }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
+ }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
+ }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+ VEC_DATA_TYPE(int, size) \
+ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+ for (int i = 0; i < 3; i++) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+ VEC_DATA_TYPE(int, size) \
+ tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+ x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+ } \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+ }
+
+/** Considering the integer value as fixed-point, changes the number of integer bits and updates the
+ * value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+ int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+ }
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+ asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+ remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+ asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
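
The macros above compose into the usual gemmlowp-style requantization step,
out = RoundingDivideByPOT(SaturatingDoublingHighMul(x, M), shift). A
hypothetical sketch, assuming a multiplier of 0.5 in Q0.31 (1 << 30) and a
right shift of 1:

    #include "helpers_asymm.h"

    __kernel void requantize_demo(__global const int *in, __global int *out)
    {
        const int g = get_global_id(0);
        const int4 multiplier = (int4)(1 << 30); // ~0.5 in Q0.31 (assumed)
        int4 x = vload4(g, in);
        // For x = 100: asymm_mult4 yields 50, the rounding divide yields 25.
        int4 y = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, multiplier, 1, 4);
        vstore4(y, g, out);
    }
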
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
new file mode 100644
index 000000000..512c62023
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+/** Performs a pixelwise division with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
+ *            e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed; otherwise -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_div_float(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform division
+#ifdef DATA_TYPE_FLOAT
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else /* DATA_TYPE_FLOAT */
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif /* DATA_TYPE_FLOAT */
+
+ // Store result
+ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
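+
+// Illustrative build-option sketch (not part of the kernel): a plain F32
+// division with no scaling could be compiled with options along the lines of
+//   -DDATA_TYPE_IN1=float -DDATA_TYPE_IN2=float -DDATA_TYPE_OUT=float
+//   -DDATA_TYPE_RES=float -DDATA_TYPE_FLOAT
+// and launched with scale = 1.0f, so that res = in1 / in2 element-wise.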
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
new file mode 100644
index 000000000..82edf3b1d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#if defined(SATURATE)
+#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#else // SATURATE
+#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#endif // SATURATE
+
+#else // FIXED_POINT_POSITION
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
+
+#endif // FIXED_POINT_POSITION
+
+/** Performs a pixelwise division with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed; otherwise -DDATA_TYPE_RES=short.
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ */
+__kernel void pixelwise_div_int(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const uint scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform division and store result
+ vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
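+
+// Worked example (sketch, non-fixed-point path): DIV_OP divides and then
+// arithmetically shifts right by `scale`, so for in1 = 100, in2 = 3 and
+// scale = 1 the stored result is (100 / 3) >> 1 = 33 >> 1 = 16.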
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
new file mode 100644
index 000000000..ddc9d5a27
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to inputs
+ * -# Multiply inputs
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Perform multiplication of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+
+ // Multiply with a multiplier smaller than 1
+ out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+// TODO: Apply min/max bounds to support fusing with ReLU.
+/*
+#if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+*/
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
+ 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
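+
+// Illustrative scalar sketch of the downscale above (not used by the kernel).
+// It mirrors ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE for non-negative
+// accumulators, assuming RESULT_SHIFT >= 1: a rounded Q31 multiply followed by
+// a rounding right shift, then the output offset.
+inline int sketch_quant_downscale(int acc)
+{
+ long prod = (long)acc * (long)RESULT_MULT_INT; // widen to avoid overflow
+ int q31 = (int)((prod + (1L << 30)) >> 31); // rounded Q31 multiply
+ int rounded = (q31 + (1 << (RESULT_SHIFT - 1))) >> RESULT_SHIFT; // rounding shift
+ return rounded + RESULT_OFFSET; // re-apply the output offset
+}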
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
new file mode 100644
index 000000000..dfa3b85f4
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH)
+/** Performs a max reduction along the X axis of a vector
+ *
+ * @note The input width must be given as a preprocessor argument using -DWIDTH=width. e.g. -DWIDTH=128
+ *
+ * @param[in] input_ptr Pointer to the source vector. Supported data types: F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduce_max(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(output))
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT(input);
+ Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+ __global float *input_addr = (__global float *)(input.ptr);
+ __global float *output_addr = (__global float *)(output.ptr);
+
+ float max_value = *input_addr;
+ for(int x = 1; x < WIDTH; x++)
+ {
+ float value = *(input_addr + x);
+ max_value = max(value, max_value);
+ }
+
+ // Store max
+ *output_addr = max_value;
+}
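+
+// Design note (assumption): the whole row is reduced sequentially by a single
+// work item, which is adequate for the small WIDTH values this kernel targets;
+// a work-group tree reduction would be the usual alternative for large rows.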
+#endif // defined(WIDTH)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
new file mode 100644
index 000000000..1a96eea61
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in = vload8(0, input);
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+ return ((in.s0 + in.s1));
+}
+
+/** This function calculates the mean of a given input image over its height.
+ *
+ * @note The data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] local_sums Local memory buffer used to accumulate the partial sums
+ * @param[in] height Height of the input image
+ * @param[in] divider Divider used to compute the mean from the accumulated sum
+ */
+__kernel void reduction_mean(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __local DATA_TYPE *local_sums,
+ int height,
+ int divider)
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Calculate the partial sum; local_sums must be zero-initialised before accumulating
+ local_sums[0] = (DATA_TYPE)0;
+ for(int i = 0; i < height; i++)
+ {
+ local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
+ }
+
+ // Store the mean
+ ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0] / divider;
+}
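+
+// Illustrative note: sum_8 collapses an 8-wide vector by repeated halving
+// (8 -> 4 -> 2 -> 1 lanes); the kernel accumulates one such block per row over
+// `height` rows and divides by `divider` to produce the mean.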
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
new file mode 100644
index 000000000..c5ff82f9e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+
+inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
+{
+ int stride_x = vector->stride_x;
+ int stride_y = stride_x * dim_x;
+ int stride_z = stride_y * dim_y;
+ int stride_w = stride_z * dim_z;
+ Tensor4D tensor =
+ {
+ .ptr = vector->ptr,
+ .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w,
+ };
+ return tensor;
+}
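+
+// Worked example (sketch): for dims (x, y, z, w) = (4, 3, 2, 1) and a 4-byte
+// element (stride_x = 4), the inferred strides are stride_y = 16,
+// stride_z = 48 and stride_w = 96 bytes, i.e. a dense 4-D layout laid over the
+// flat vector.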
+
+/** Extracts a strided slice up to 4-dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dims_in The 4-D shape of the input tensor. Supported data types: S32
+ * @param[in] dims_out The 4-D shape of the output tensor. Supported data types: S32
+ * @param[in] starts The start index of each dimension of the input tensor to be sliced. Supported data types: S32
+ * @param[in] strides The stride of each dimension of the input tensor to be sliced. Supported data types: S32
+ */
+__kernel void strided_slice(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(output),
+ const int4 dims_in,
+ const int4 dims_out,
+ const int4 starts,
+ const int4 strides)
+{
+ // TODO: Should be changed to CONVERT_TO_TENSOR4D_STRUCT in order to avoid manually inferring the offsets
+ Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
+ Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+ // Implementation outline:
+ // - Infer a Tensor4D from the output Vector and the output's dimension info
+ // - Infer a Tensor4D from the input Vector and the input's dimension info
+ // - Infer the 4-D indices of the output from the offset of the output vector
+ // - Infer the 4-D indices of the input from the indices of the output
+ // - out(offset of output vector) = in(offset of input)
+
+ Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
+ Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
+
+ // Requires output_step_x == output_stride_x == the element size
+ const int offset_out = get_global_id(0) * output_stride_x;
+ int4 indices_out =
+ {
+ get_global_id(0) % dims_out.x,
+ (offset_out / tensor_out.stride_y) % dims_out.y,
+ (offset_out / tensor_out.stride_z) % dims_out.z,
+ (offset_out / tensor_out.stride_w) % dims_out.w,
+ };
+
+ int4 indices_in =
+ {
+ starts.x + (strides.x * indices_out.x),
+ starts.y + (strides.y * indices_out.y),
+ starts.z + (strides.z * indices_out.z),
+ starts.w + (strides.w * indices_out.w),
+ };
+
+ *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
+}
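+
+// Worked example (sketch): slicing a (4, 4, 1, 1) input with
+// starts = (1, 0, 0, 0) and strides = (2, 1, 1, 1) into a (2, 4, 1, 1) output
+// maps output x-index 0 to input x-index 1 and output x-index 1 to input
+// x-index 3, per indices_in = starts + strides * indices_out above.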
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
new file mode 100644
index 000000000..0b0cf8218
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
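+// Pipeline overview (sketch, assuming the host enqueues these kernels in
+// order): topkv2_init copies the input vector into flat key/index buffers,
+// the radix-sort kernels in topkv2_radixsort.cl order the keys,
+// topkv2_find_first_negative and topkv2_reorder_negatives fix up the block of
+// negative keys, and topkv2_store writes the top-k values and their original
+// indices back to the output vectors.
+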
+__kernel void topkv2_init(VECTOR_DECLARATION(input),
+ __global float* in_key_buf,
+ __global int* in_ind_buf,
+ const int n)
+{
+ int gid = get_global_id(0);
+ int lws = get_local_size(0);
+ int groups = get_num_groups(0);
+ int gws = lws * groups;
+ int iter = n / gws;
+
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+ for(int i = 0; i < iter; ++i)
+ {
+ int idx = i * gws + gid;
+ in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
+ in_ind_buf[idx] = idx;
+ }
+}
+
+__kernel void topkv2_find_first_negative(
+ __global float *out_key_buf,
+ __global int *first_negative_idx,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ if( gid == n - 1 )
+ {
+ // if the last item is positive, the first negative index is n.
+ if( out_key_buf[gid] > 0.f )
+ *first_negative_idx = n;
+ } else if ( gid == 0 ) {
+ // if the first item is negative, the first negative index is 0.
+ if( out_key_buf[gid] < 0.f )
+ *first_negative_idx = 0;
+ } else {
+ // if its left neighbour is positive and it is negative, then it is the first negative item.
+ if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
+ *first_negative_idx = gid;
+ }
+}
+
+__kernel void topkv2_reorder_negatives(
+ __global float* in_key_buf,
+ __global float* out_key_buf,
+ __global int* in_ind_buf,
+ __global int* out_ind_buf,
+ __global int* first_negative_idx,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ int num_negs = n - *first_negative_idx;
+ int in_idx;
+
+ if( gid < num_negs ) {
+ in_idx = n - 1 - gid;
+ } else {
+ in_idx = gid - num_negs;
+ }
+
+ out_key_buf[gid] = in_key_buf[in_idx];
+ out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+__kernel void topkv2_store(
+ VECTOR_DECLARATION(values),
+ VECTOR_DECLARATION(indices),
+ __global float *out_key_buf,
+ __global int *out_ind_buf,
+ int n)
+{
+ int gid = get_global_id(0);
+
+ Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+ Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+ int idx = n - 1 - gid;
+
+ *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+ *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..deadf8412
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+inline __global float* get_vec_elem(Vector* vec, int idx)
+{
+ return (__global float*)(vec->ptr + idx * vec->stride_x);
+}
+
+inline __global int* get_vec_elem_int(Vector* vec, int idx)
+{
+ return (__global int*)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+ float t = *a;
+ *a = *b;
+ *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+ int t = *a;
+ *a = *b;
+ *b = t;
+}
+
+/* This function is the same for both the iterative and the recursive versions */
+int partition (Vector* arr, __global int* indices, int l, int h)
+{
+ float x = *get_vec_elem(arr, h);
+ int i = (l - 1);
+
+ for (int j = l; j <= h - 1; j++)
+ {
+ if (*get_vec_elem(arr, j) >= x)
+ {
+ i++;
+ swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
+ swap_idx(&indices[i], &indices[j]);
+ }
+ }
+ swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+ swap_idx(&indices[i + 1], &indices[h]);
+ return (i + 1);
+}
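+
+// Note: the `>= x` comparison partitions in descending order, so the largest
+// values end up at the front and the top-k extraction in topkv2_quicksort can
+// simply read the first k elements.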
+
+/* arr --> Vector to be sorted,
+ indices --> index array permuted alongside arr,
+ l --> Starting index,
+ h --> Ending index */
+void quickSortIterative (Vector* arr, __global int* indices,
+ __global int *stack, int l, int h)
+{
+ // Create an auxiliary stack
+
+ // initialize top of stack
+ int top = -1;
+
+ // push initial values of l and h to stack
+ stack[ ++top ] = l;
+ stack[ ++top ] = h;
+
+ // Keep popping from the stack while it is not empty
+ while ( top >= 0 )
+ {
+ // Pop h and l
+ h = stack[ top-- ];
+ l = stack[ top-- ];
+
+ // Set pivot element at its correct position
+ // in sorted array
+ int p = partition( arr, indices, l, h );
+
+ // If there are elements on left side of pivot,
+ // then push left side to stack
+ if ( p-1 > l )
+ {
+ stack[ ++top ] = l;
+ stack[ ++top ] = p - 1;
+ }
+
+ // If there are elements on right side of pivot,
+ // then push right side to stack
+ if ( p+1 < h )
+ {
+ stack[ ++top ] = p + 1;
+ stack[ ++top ] = h;
+ }
+ }
+}
+
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
+ __global int* indices, __global int* temp_stack, int k, int n)
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+ Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+ Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+ for( int i = 0; i < n; ++i )
+ {
+ indices[i] = i;
+ }
+
+ quickSortIterative(&input, indices, temp_stack, 0, n-1);
+
+ // extract k items.
+ for(int i = 0; i < k; ++i)
+ {
+ *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+ *get_vec_elem_int(&topk_indices, i) = indices[i];
+ }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..cac0c071e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
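+
+// Background note (sketch): reinterpreting an IEEE-754 float as an unsigned
+// int preserves the ordering of non-negative values, while negative values
+// land at the top of the unsigned range in reversed order. The kernels below
+// therefore sort the raw key bits; topkv2_find_first_negative and
+// topkv2_reorder_negatives in topkv2.cl then move the negative block back
+// into place.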
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float* in_key_buf,
+ __global int* d_Histograms,
+ const int pass,
+ __local int* loc_histo,
+ const int n)
+{
+ int it = get_local_id(0); // i local number of the processor
+ int ig = get_global_id(0); // global number = i + g I
+
+ int gr = get_group_id(0); // g group number
+
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ // set the local histograms to zero
+ for(int ir=0;ir<_RADIX;ir++){
+ loc_histo[ir * items + it] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // range of keys that are analyzed by the work item
+ int size= n/groups/items; // size of the sub-list
+ int start= ig * size; // beginning of the sub-list
+
+ unsigned int key;
+ int shortkey,k;
+
+ // compute the index
+ // the computation depends on the transposition
+ for(int j = 0; j < size ; j++) {
+#ifdef TRANSPOSE
+ k= groups * items * j + ig;
+#else
+ k=j+start;
+#endif
+
+ key = *((__global unsigned int*)(in_key_buf + k));
+
+ // extract the group of _BITS bits of the pass
+ // the result is in the range 0.._RADIX-1
+ shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
+
+ // increment the local histogram
+ loc_histo[shortkey * items + it ]++;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // copy the local histogram to the global one
+ for(int ir=0;ir<_RADIX;ir++) {
+ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
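+
+// Worked example (sketch, assuming _BITS = 4 and _RADIX = 16): pass 0
+// histograms the low nibble (key & 0xF), pass 1 the next nibble
+// ((key >> 4) & 0xF), and so on, so 32-bit keys are fully ordered after 8
+// passes.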
+
+// initial transpose of the list for improving
+// coalescent memory access
+__kernel void transpose(const __global int* invect,
+ __global int* outvect,
+ const int nbcol,
+ const int nbrow,
+ const __global int* inperm,
+ __global int* outperm,
+ __local int* blockmat,
+ __local int* blockperm,
+ const int tilesize){
+
+ int i0 = get_global_id(0)*tilesize; // first row index
+ int j = get_global_id(1); // column index
+
+ int jloc = get_local_id(1); // local column index
+
+ // fill the cache
+ for(int iloc=0;iloc<tilesize;iloc++){
+ int k=(i0+iloc)*nbcol+j; // position in the matrix
+ blockmat[iloc*tilesize+jloc]=invect[k];
+#ifdef PERMUT
+ blockperm[iloc*tilesize+jloc]=inperm[k];
+#endif
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first row index in the transpose
+ int j0=get_group_id(1)*tilesize;
+
+ // put the cache at the good place
+ for(int iloc=0;iloc<tilesize;iloc++){
+ int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
+ outvect[kt]=blockmat[jloc*tilesize+iloc];
+#ifdef PERMUT
+ outperm[kt]=blockperm[jloc*tilesize+iloc];
+#endif
+ }
+
+}
+
+// each virtual processor reorders its data using the scanned histogram
+__kernel void radixsort_reorder(__global float* in_key,
+ __global float* out_key,
+ __global int* d_Histograms,
+ const int pass,
+ __global int* indices_in,
+ __global int* indices_out,
+ __local int* loc_histo,
+ const int n){
+
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+
+ int gr = get_group_id(0);
+ int groups=get_num_groups(0);
+ int items=get_local_size(0);
+
+ int start= ig *(n/groups/items);
+ int size= n/groups/items;
+
+ // take the histogram in the cache
+ for(int ir=0;ir<_RADIX;ir++){
+ loc_histo[ir * items + it]=
+ d_Histograms[items * (ir * groups + gr) + it];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int newpos,shortkey,k,newpost;
+ unsigned int key;
+
+ for(int j= 0; j< size;j++){
+#ifdef TRANSPOSE
+ k= groups * items * j + ig;
+#else
+ k=j+start;
+#endif
+ float org_value = in_key[k];
+ key = *(__global unsigned int*)(in_key + k);
+ shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
+
+ newpos=loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+ int ignew,jnew;
+ ignew= newpos/(n/groups/items);
+ jnew = newpos%(n/groups/items);
+ newpost = jnew * (groups*items) + ignew;
+#else
+ newpost=newpos;
+#endif
+
+ //d_outKeys[newpost]= key; // killing line !!!
+ out_key[newpost] = org_value;
+
+#ifdef PERMUT
+ indices_out[newpost] = indices_in[k];
+#endif
+
+ newpos++;
+ loc_histo[shortkey * items + it]=newpos;
+ }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
+{
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+ int decale = 1;
+ int n=get_local_size(0) * 2 ;
+ int gr=get_group_id(0);
+
+ // load input into local memory
+ // up sweep phase
+ temp[2*it] = histo[2*ig];
+ temp[2*it+1] = histo[2*ig+1];
+
+ // parallel prefix sum (algorithm of Blelloch 1990)
+ for (int d = n>>1; d > 0; d >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (it < d){
+ int ai = decale*(2*it+1)-1;
+ int bi = decale*(2*it+2)-1;
+ temp[bi] += temp[ai];
+ }
+ decale *= 2;
+ }
+
+ // store the last element in the global sum vector
+ // (maybe used in the next step for constructing the global scan)
+ // clear the last element
+ if (it == 0) {
+ globsum[gr]=temp[n-1];
+ temp[n - 1] = 0;
+ }
+
+ // down sweep phase
+ for (int d = 1; d < n; d *= 2){
+ decale >>= 1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (it < d){
+ int ai = decale*(2*it+1)-1;
+ int bi = decale*(2*it+2)-1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write results to device memory
+
+ histo[2*ig] = temp[2*it];
+ histo[2*ig+1] = temp[2*it+1];
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+}
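+
+// Worked example (sketch): for a 4-element block [3, 1, 7, 0] the up sweep
+// produces [3, 4, 7, 11]; clearing the last element (saved to globsum) and
+// down sweeping yields the exclusive prefix sum [0, 3, 4, 11].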
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
+{
+ int ig = get_global_id(0);
+ int gr=get_group_id(0);
+
+ int s;
+
+ s=globsum[gr];
+
+ // write results to device memory
+ histo[2*ig] += s;
+ histo[2*ig+1] += s;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..b019e8c33
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ const float scale_in = input->info()->quantization_info().scale;
+ const int offset_in = input->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
+ }
+ else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+ {
+ // Note: the cast_qasymm_out kernel reads the output quantization info from the same SCALE_IN/OFFSET_IN build options
+ const float scale_out = output->info()->quantization_info().scale;
+ const int offset_out = output->info()->quantization_info().offset;
+ build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_out));
+ build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_out));
+
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
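+
+// Usage sketch (assumption: through the CLCast runtime function declared in
+// arm_compute/runtime/CL/functions/CLCast.h, which wraps this kernel):
+//
+//   CLTensor input, output;
+//   input.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));
+//   output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
+//   CLCast cast;
+//   cast.configure(&input, &output);
+//   input.allocator()->allocate();
+//   output.allocator()->allocate();
+//   cast.run();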
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
new file mode 100644
index 000000000..23efafa6a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
+ DataType::F32);
+
+ return Status{};
+}
+
+} // namespace
+
+CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
+
+void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Construct kernel name
+ std::string kernel_name = "gather";
+ if (input1->info()->num_dimensions() == 1)
+ {
+ kernel_name = "gather_1d";
+ }
+ else if (input1->info()->num_dimensions() == 2)
+ {
+ if (_output->info()->num_dimensions() == 1)
+ {
+ kernel_name = "gather_1d_out";
+ }
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+
+ return Status{};
+}
+
+void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ if (_input1->info()->num_dimensions() == 1)
+ {
+ Window slice = window.first_slice_window_1D();
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input1, slice);
+ add_1D_tensor_argument(idx, _input2, slice);
+ add_1D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ else if (_input1->info()->num_dimensions() == 2)
+ {
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+ Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, window_collapsed);
+ add_1D_tensor_argument(idx, _input2, slice);
+ if (_output->info()->num_dimensions() == 1)
+ {
+ add_1D_tensor_argument(idx, _output, slice);
+ }
+ else
+ {
+ add_2D_tensor_argument(idx, _output, window_collapsed);
+ }
+ enqueue(queue, *this, slice);
+ }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
new file mode 100644
index 000000000..a3e0163de
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_UNUSED(overflow_policy);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+ if (is_data_type_fixed_point(input1->data_type()))
+ {
+ // All data types must be all QS8 or all QS16
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
+ "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
+ }
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
+ DataType::QS16, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ output->data_type() == DataType::U8 &&
+ (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+ if (is_data_type_fixed_point(input1->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ }
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+ ITensorInfo *output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output, out_shape);
+
+ if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output, Format::S16);
+ }
+ else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output, Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
+ scale, overflow_policy, rounding_policy));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+ // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15.
+ // For such values frexp returns a mantissa of 0.5 and an exponent e = 1 - n,
+ // so e lies in the range -14 <= e <= 1 and is negative for n >= 2.
+ if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent n. Since scale = 1/2^n and frexp reports a
+ // mantissa of 0.5, n = 1 - exponent, i.e. std::abs(exponent - 1).
+ scale_int = std::abs(exponent - 1);
+ }
+
+ std::string data_type;
+ std::string compute_type;
+ // Check if it has float inputs and output
+ if (is_data_type_float(input1->info()->data_type()) ||
+ is_data_type_float(input2->info()->data_type()))
+ {
+ scale_int = -1;
+ compute_type = (input1->info()->data_type() == DataType::F32 ||
+ input2->info()->data_type() == DataType::F32)
+ ? "float"
+ : "half";
+ data_type = "DATA_TYPE_FLOAT";
+ }
+ else
+ {
+ if (input1->info()->data_type() == DataType::S16 ||
+ input2->info()->data_type() == DataType::S16)
+ {
+ compute_type = "int";
+ }
+ else if (input1->info()->data_type() == DataType::QS8)
+ {
+ compute_type = "qs8";
+ }
+ else if (input1->info()->data_type() == DataType::QS16)
+ {
+ compute_type = "qs16";
+ }
+ else
+ {
+ compute_type = "ushort";
+ }
+ data_type = "DATA_TYPE_INT";
+ }
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_div";
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(
+ (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()))
+ ? "-DWRAP"
+ : "-DSATURATE");
+ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
+ : "-DROUND=_rte");
+ if (is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" +
+ support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Set scale argument
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
+
+ if (scale_int >= 0)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ ICLKernel::configure(win_config.second);
+}
+
+Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+ input2->clone().get(),
+ output->clone().get())
+ .first);
+
+ return Status{};
+}
+
+void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLPixelWiseDivisionKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
new file mode 100644
index 000000000..168b246bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+{
+ // Only the simple case is handled:
+ //   Input rank: 2
+ //   Output rank: 1
+ //   Axis: a single axis value, restricted to 1
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Only axis 1 is supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's total size must be non-zero");
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
+ "Input and output must have the same data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
+ "Output rank must be 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
+ "Input rank must be 2");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
+
+void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ // Configure kernel window
+ int cols = _input->info()->tensor_shape()[0];
+ int rows = _input->info()->tensor_shape()[1];
+ Window win;
+ win.set(0, Window::Dimension(0, cols, 1));
+ win.set(1, Window::Dimension(0, rows, 1));
+
+ // Construct kernel name
+ std::string kernel_name = "reduce_max";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ ICLKernel::configure(win);
+}
+
+Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
+
+ return Status{};
+}
+
+void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_input = window;
+ Window slice_input = window_input.first_slice_window_1D();
+
+ do
+ {
+ Window slice_output = slice_input.shift_dimensions(1);
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, slice_input);
+ add_1D_tensor_argument(idx, _output, slice_output);
+ enqueue(queue, *this, slice_input);
+
+ } while (window_input.slide_window_slice_1D(slice_input));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
new file mode 100644
index 000000000..84a77122d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
+ "Number of reduction axes exceeds the maximum number of dimensions");
+
+ std::vector<uint32_t>::const_iterator it;
+ bool axis_w = false;
+ bool axis_h = false;
+ for (it = axis.begin(); it != axis.end(); ++it)
+ {
+ if ((*it) == 0)
+ {
+ axis_w = true;
+ }
+ else if ((*it) == 1)
+ {
+ axis_h = true;
+ }
+ else
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
+ }
+ }
+ // TODO: Support other axes (currently, only reduction over both width and height is supported.)
+ if (!axis_w || !axis_h)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
+ }
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(0, 1);
+ output_shape.set(1, 1);
+ auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
+ input->fixed_point_position());
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
+ const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
+
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowHorizontal output_access(output, 0, 1);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+CLReductionMeanKernel::CLReductionMeanKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
+{
+}
+
+BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
+
+void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
+
+ _input = input;
+ _output = output;
+ _reduction_axis = axis;
+
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
+
+ // Set border size
+ _border_size = BorderSize(
+ ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
+ input->info()->dimension(0));
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ // build_opts.emplace(("-DVEC_SIZE=" +
+ // support::cpp11::to_string(num_elems_processed_per_iteration)));
+ if (is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" +
+ support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure(std::get<1>(win_config));
+}
+
+Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+ return Status{};
+}
+
+void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Set local sums buffer
+ // TODO work_group
+ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, local_sum_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
+ _input->info()->dimension(1))); // divider
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
new file mode 100644
index 000000000..80ffd423a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+static const int32_t maxDim = 4;
+
+CLStridedSliceKernel::CLStridedSliceKernel()
+ : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
+ _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
+{
+}
+
+Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *strides, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
+ DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(),
+ strides->tensor_shape());
+
+ return Status{};
+}
+
+// Return the index of the first element along the given axis. The result is a
+// non-negative integer in the range [0, axisSize - 1] that can be used to index
+// directly into the data.
+inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride,
+ const TensorShape &inputShape, int32_t axis)
+{
+ // Begin with the specified index
+ int32_t start = begin;
+
+ // beginMask override
+ if (beginMask & (1 << axis))
+ {
+ if (stride > 0)
+ {
+ // Forward iteration - use the first element. These values will get
+ // clamped below (Note: We could have set them to 0 and axisSize-1, but
+ // use lowest() and max() to maintain symmetry with StopForAxis())
+ start = std::numeric_limits<int32_t>::lowest();
+ }
+ else
+ {
+ // Backward iteration - use the last element.
+ start = std::numeric_limits<int32_t>::max();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (start < 0)
+ {
+ start += axisSize;
+ }
+
+ // Clamping
+ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
+
+ return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element. ie. So if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
+ const TensorShape &inputShape, int32_t axis)
+{
+ // Begin with the specified index
+ int32_t stop = end;
+
+ // endMask override
+ if (endMask & (1 << axis))
+ {
+ if (stride > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int32_t>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int32_t>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (stop < 0)
+ {
+ stop += axisSize;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (stride > 0)
+ {
+ // Forward iteration
+ stop = arm_compute::utility::clamp(stop, 0, axisSize);
+ }
+ else
+ {
+ // Backward iteration
+ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
+ }
+
+ return stop;
+}
+
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
+{
+ int32_t offset = b * shape[2] * shape[1] * shape[0];
+ offset += d * shape[1] * shape[0];
+ offset += h * shape[0];
+ offset += w;
+ return offset;
+}
+
+inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
+{
+ int32_t ret = 0;
+ if (stride > 0)
+ {
+ ret = ((stop - start - 1) / stride) + 1;
+ }
+ else
+ {
+ ret = ((stop - start + 1) / stride) + 1;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The output dimension must be non-negative");
+ return ret;
+}
+
+void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
+ endData->info(), stridesData->info(), beginMask, endMask,
+ shrinkAxisMask));
+
+ _input = input;
+ _output = output;
+ _beginData = beginData;
+ _endData = endData;
+ _stridesData = stridesData;
+ _beginMask = beginMask;
+ _endMask = endMask;
+ _shrinkAxisMask = shrinkAxisMask;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DELEMENT_DATA_TYPE=" +
+ get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+
+ // Create output's window without padding
+ TensorShape collapsed = output->info()->tensor_shape();
+ collapsed.collapse(4);
+ TensorInfo info = *output->info();
+ info.set_tensor_shape(collapsed);
+ Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
+
+ ICLKernel::configure(win);
+}
+
+void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Create input window
+ TensorShape collapsed = _input->info()->tensor_shape();
+ collapsed.collapse(4);
+ TensorInfo info = *_input->info();
+ info.set_tensor_shape(collapsed);
+ Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
+
+ _beginData->map(queue);
+ _endData->map(queue);
+ _stridesData->map(queue);
+
+ std::vector<int32_t> dimsIn;
+ std::vector<int32_t> dimsOut;
+ std::vector<int32_t> starts;
+ std::vector<int32_t> stops;
+ std::vector<int32_t> strides;
+
+ for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
+ {
+ const TensorShape shape = _input->info()->tensor_shape();
+ starts.emplace_back(
+ StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
+ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
+
+ stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
+ reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
+ n));
+
+ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
+ dimsIn.emplace_back(shape[n]);
+ dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
+ }
+
+ for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
+ {
+ starts.emplace_back(0);
+ stops.emplace_back(1);
+ strides.emplace_back(1);
+ dimsIn.emplace_back(1);
+ dimsOut.emplace_back(1);
+ }
+ // TODO: Apply shrinkAxisMask
+
+ _beginData->unmap(queue);
+ _stridesData->unmap(queue);
+ _endData->unmap(queue);
+
+ // Set parameters
+ unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ const cl_int4 dimsInArg = {{
+ static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
+ static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, dimsInArg);
+
+ const cl_int4 dimsOutArg = {{
+ static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
+ static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, dimsOutArg);
+
+ const cl_int4 startsArg = {{
+ static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
+ static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, startsArg);
+
+ const cl_int4 stridesArg = {{
+ static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]),
+ static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]),
+ }};
+ _kernel.setArg<cl_int4>(idx++, stridesArg);
+
+ // TODO: Slice the output window
+ idx = 0;
+ add_1D_tensor_argument(idx, _input, win_in);
+ add_1D_tensor_argument(idx, _output, window);
+
+ enqueue(queue, *this, window);
+}
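The interplay of StartForAxis, StopForAxis and getOutDim is easiest to see on a concrete axis. A worked example for an axis of size 6 with begin = 1, end = 100, stride = 2 and no mask bits set (assertions only, independent of the patch):

#include <cassert>

int main()
{
  // StartForAxis clamps 1 to [0, 5] -> 1; StopForAxis clamps 100 to [0, 6] -> 6.
  const int start = 1, stop = 6, stride = 2;
  // getOutDim for stride > 0: ((6 - 1 - 1) / 2) + 1 == 3, i.e. indices 1, 3, 5.
  assert(((stop - start - 1) / stride) + 1 == 3);
  // With the beginMask bit set for this axis, start snaps to 0 instead,
  // yielding ((6 - 0 - 1) / 2) + 1 == 3 as well (indices 0, 2, 4).
  return 0;
}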
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..d95b485b7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+#include <cassert>
+
+namespace arm_compute
+{
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
+
+void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
+ cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr || indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(topk_values == nullptr || topk_indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+ _topk_values = topk_values;
+ _topk_indices = topk_indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
+
+ unsigned int idx = 3 * num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *indices);
+ _kernel.setArg(idx++, *temp_stack);
+ _kernel.setArg<cl_int>(idx++, k);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+ add_1D_tensor_argument(idx, _topk_values, window);
+ add_1D_tensor_argument(idx, _topk_indices, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
+
+void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
+ int n)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr || in_key_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ _input = input;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
+
+ unsigned int idx = num_arguments_per_1D_tensor();
+ _kernel.setArg(idx++, *in_key_buf);
+ _kernel.setArg(idx++, *in_ind_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, window);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This kernel makes a histogram of radix for each work item.
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
+
+void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
+
+ int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 3;
+ _kernel.setArg(idx++, loc_histo_size, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg<cl_int>(2, _pass);
+
+ cl::NDRange lws = cl::NDRange(_ITEMS, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
+
+void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
+
+void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
+ int bits)
+{
+ ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr || temp_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
+
+ int temp_size =
+ std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *glob_sum_buf);
+ _kernel.setArg(idx++, temp_size, nullptr);
+ _kernel.setArg(idx++, *temp_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
+
+void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr || glob_sum_buf == nullptr);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *hist_buf);
+ _kernel.setArg(idx++, *glob_sum_buf);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLRadixSortReorder::CLRadixSortReorder()
+ : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
+ _out_ind_buf(nullptr)
+{
+}
+
+void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
+{
+ ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ unsigned int radix = 1 << bits;
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
+ build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
+ build_opts.emplace("-DPERMUT=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
+
+ unsigned int idx = 2;
+ _kernel.setArg(idx++, *hist_buf);
+
+ idx = 6;
+ _kernel.setArg(idx++, sizeof(cl_uint) * radix * _ITEMS, nullptr);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
+ ICLKernel::configure(win);
+}
+
+void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
+ unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
+ cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
+
+ _kernel.setArg(0, *_in_key_buf);
+ _kernel.setArg(1, *_out_key_buf);
+ _kernel.setArg<cl_int>(3, _pass);
+ _kernel.setArg(4, *_in_ind_buf);
+ _kernel.setArg(5, *_out_ind_buf);
+
+ enqueue(queue, *this, window, lws);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
+
+void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
+
+ unsigned int idx = 1;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_out_key_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
+ : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
+{
+ ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
+ ARM_COMPUTE_ERROR_ON(n == 0);
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
+
+ unsigned int idx = 4;
+ _kernel.setArg(idx++, *first_negative_idx_buf);
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, n, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, *_in_key_buf);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_in_ind_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+CLTopKV2Store::CLTopKV2Store()
+ : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
+{
+}
+
+void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
+{
+ ARM_COMPUTE_ERROR_ON(values == nullptr || indices == nullptr);
+ ARM_COMPUTE_ERROR_ON(k == 0);
+ ARM_COMPUTE_ERROR_ON(k > n);
+
+ _values = values;
+ _indices = indices;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
+
+ unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
+ _kernel.setArg<cl_int>(idx++, n);
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, k, 1));
+ ICLKernel::configure(win);
+}
+
+void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
+{
+ _out_key_buf = out_key_buf;
+ _out_ind_buf = out_ind_buf;
+}
+
+void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _values, window);
+ add_1D_tensor_argument(idx, _indices, window);
+ _kernel.setArg(idx++, *_out_key_buf);
+ _kernel.setArg(idx++, *_out_ind_buf);
+
+ enqueue(queue, *this, window);
+}
+
+} // namespace arm_compute
diff --git a/libs/kernel/acl/src/Init_acl.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
index cabf079fa..e1059ab53 100644
--- a/libs/kernel/acl/src/Init_acl.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -1,11 +1,12 @@
/*
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,20 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#include "arm_compute/runtime/CL/functions/CLCast.h"
-#include <arm_compute/runtime/CL/CLScheduler.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+#include "support/ToolchainSupport.h"
-namespace nnfw {
-namespace kernel {
-namespace acl {
+using namespace arm_compute;
-// This will do one time initialization but can be called multiple times
-void Initialize(void)
+void CLCast::configure(ICLTensor *input, ICLTensor *output)
{
- arm_compute::CLScheduler::get().default_init();
+ auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
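A hypothetical call site for the new function (tensor shapes and names are illustrative; it assumes CLScheduler::get().default_init() has already been called):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"

using namespace arm_compute;

void cast_u8_to_f32()
{
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));
  output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

  CLCast cast;
  cast.configure(&input, &output);

  input.allocator()->allocate();
  output.allocator()->allocate();
  cast.run(); // enqueues CLCastKernel on the scheduler's queue
}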
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 000000000..5552cbc6f
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ return CLGatherKernel::validate(input1, input2, output);
+}
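The wrapped kernel implements a plain gather, where input2 holds the indices; for 1-D tensors the semantics reduce to output[i] = input1[input2[i]]. A host-side sketch with illustrative names:

#include <vector>

static std::vector<float> gather_1d(const std::vector<float> &data,
                                    const std::vector<int> &indices)
{
  std::vector<float> out;
  out.reserve(indices.size());
  for (int idx : indices)
    out.push_back(data[idx]); // assumes 0 <= idx < data.size()
  return out;
}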
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
new file mode 100644
index 000000000..e1add5e90
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy,
+ rounding_policy);
+}
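A hypothetical call site, assuming x, y and z are already-initialized F32 CLTensors (the scale and policy values are examples only):

CLPixelWiseDivision div;
div.configure(&x, &y, &z, /*scale=*/1.0f, ConvertPolicy::SATURATE,
              RoundingPolicy::TO_NEAREST_EVEN);
div.run(); // runs the border handler configured above, then the kernel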
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
new file mode 100644
index 000000000..3382058db
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
+
+#include <vector>
+#include <algorithm>
+
+#include <utility>
+
+#define REDUCE_MAX_RUN_ON_CPU 1
+
+namespace arm_compute
+{
+
+CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
+
+void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
+{
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
+ k->configure(input, axis, output);
+ _kernel = std::move(k);
+
+ // Only the simple case is handled:
+ //   Input rank: 2
+ //   Output rank: 1
+ //   Axis: a single axis value, restricted to 1
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
+ ARM_COMPUTE_ERROR_ON(axis != 1);
+}
+
+Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
+{
+ return CLReduceMaxKernel::validate(input, axis, output);
+}
+
+void CLReduceMax::run()
+{
+#if REDUCE_MAX_RUN_ON_CPU
+ run_on_cpu();
+
+ arm_compute::CLScheduler::get().sync();
+#else
+ arm_compute::CLScheduler::get().enqueue(*_kernel);
+#endif
+}
+
+void CLReduceMax::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ _input->map(q);
+ _output->map(q);
+
+ // Compute on the CPU for the simple case:
+ //   Input rank: 2
+ //   Output rank: 1
+ //   Axis: a single axis value, restricted to 1
+
+ float *input_data = reinterpret_cast<float *>(_input->buffer());
+ float *output_data = reinterpret_cast<float *>(_output->buffer());
+
+ std::vector<float> container_max;
+ int cols = _input->info()->tensor_shape()[0];
+ int rows = _input->info()->tensor_shape()[1];
+ container_max.resize(rows);
+
+ // Initialize as 1st element in row
+ float *input_pointer = input_data;
+ for (int i = 0; i < rows; i++)
+ {
+ container_max[i] = *input_pointer;
+ input_pointer += cols;
+ }
+
+ // Update max value in row
+ for (int i = 0; i < rows; i++)
+ {
+ float max_in_row = container_max[i];
+ for (int j = 1; j < cols; j++)
+ {
+ if (max_in_row < input_data[i * cols + j])
+ {
+ max_in_row = input_data[i * cols + j];
+ }
+ }
+ container_max[i] = max_in_row;
+ }
+
+ for (int i = 0; i < rows; i++)
+ {
+ output_data[i] = container_max[i];
+ }
+
+ _input->unmap(q);
+ _output->unmap(q);
+}
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
new file mode 100644
index 000000000..ab724e752
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {}
+
+Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output,
+ std::vector<uint32_t> axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
+ return Status{};
+}
+
+void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
+{
+ _reduction_mean_kernel.configure(input, output, axis);
+ _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT,
+ PixelValue(0));
+}
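+
+// Usage sketch (hypothetical tensors): reduce a 2-D input to its mean over
+// axes {0, 1}, assuming `in` and `out` are already-allocated CLTensors:
+//   CLReductionMean mean;
+//   mean.configure(&in, &out, std::vector<uint32_t>{0, 1});
+//   mean.run(); // fills the border, then enqueues the reduction kernel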
+
+void CLReductionMean::run()
+{
+ CLScheduler::get().enqueue(_fill_border_kernel);
+ CLScheduler::get().enqueue(_reduction_mean_kernel);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
new file mode 100644
index 000000000..cd576cec1
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+#include <vector>
+
+using namespace arm_compute;
+
+static const int32_t maxDims = 4;
+
+// Return the index of the first element along that axis. This index is a
+// non-negative integer in [0, axisSize - 1] that can be used to index
+// directly into the data.
+inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
+ std::vector<int32_t> const &strides, const TensorShape &inputShape,
+ int32_t axis)
+{
+ // Begin with the specified index
+ int32_t start = startIndices[axis];
+
+ // beginMask override
+ if (beginMask & (1 << axis))
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the first element. These values will get
+ // clamped below (Note: We could have set them to 0 and axisSize-1, but
+ // use lowest() and max() to maintain symmetry with StopForAxis())
+ start = std::numeric_limits<int32_t>::lowest();
+ }
+ else
+ {
+ // Backward iteration - use the last element.
+ start = std::numeric_limits<int32_t>::max();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (start < 0)
+ {
+ start += axisSize;
+ }
+
+ // Clamping
+ start = arm_compute::utility::clamp(start, 0, axisSize - 1);
+
+ return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element, i.e., if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 and 3.
+inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
+ std::vector<int32_t> const &strides, const TensorShape &inputShape,
+ int32_t axis)
+{
+ // Begin with the specified index
+ int32_t stop = stopIndices[axis];
+
+ // endMask override
+ if (endMask & (1 << axis))
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int32_t>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int32_t>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ int32_t axisSize = inputShape[axis];
+ if (stop < 0)
+ {
+ stop += axisSize;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (strides[axis] > 0)
+ {
+ // Forward iteration
+ stop = arm_compute::utility::clamp(stop, 0, axisSize);
+ }
+ else
+ {
+ // Backward iteration
+ stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
+ }
+
+ return stop;
+}
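+
+// Worked example (a sketch, not part of the kernel): with no masks, a 1-D axis
+// of size 4, begin = 1, end = 3, stride = 1:
+//   start = StartForAxis(0, {1}, {1}, shape, 0); // -> 1
+//   stop = StopForAxis(0, {3}, {1}, shape, 0);   // -> 3
+// so the slice visits indices 1 and 2. With both mask bits set and stride = -1,
+// start clamps to the last element (3) and stop to -1, walking the axis in reverse.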
+
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
+{
+ int32_t offset = b * shape[2] * shape[1] * shape[0];
+ offset += d * shape[1] * shape[0];
+ offset += h * shape[0];
+ offset += w;
+ return offset;
+}
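+
+// Linearization example for offset4D above (hypothetical shape): with
+// shape[0] = 4 (W), shape[1] = 3 (H), shape[2] = 2 (D),
+// offset4D(shape, /*b=*/0, /*d=*/1, /*h=*/2, /*w=*/3) = 1*3*4 + 2*4 + 3 = 23.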
+
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+ _kernel = std::move(k);
+}
+
+void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
+ input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
+ beginMask, endMask, shrinkAxisMask));
+
+ _input = input;
+ _output = output;
+ _beginData = beginData;
+ _endData = endData;
+ _stridesData = stridesData;
+ _beginMask = beginMask;
+ _endMask = endMask;
+ _shrinkAxisMask = shrinkAxisMask;
+}
+
+void CLStridedSliceCPU::run()
+{
+ run_on_cpu();
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
+{
+ if (stride > 0)
+ {
+ return ((stop - start - 1) / stride) + 1;
+ }
+ else
+ {
+ return ((stop - start + 1) / stride) + 1;
+ }
+}
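+
+// Worked examples for getOutDim above:
+//   getOutDim(1, 7, 2)   -> ((7 - 1 - 1) / 2) + 1 = 3   (indices 1, 3, 5)
+//   getOutDim(5, -1, -2) -> ((-1 - 5 + 1) / -2) + 1 = 3 (indices 5, 3, 1)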
+
+template <typename T>
+inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
+ int32_t endMask, const std::vector<int32_t> &startIndices,
+ const std::vector<int32_t> &stopIndices,
+ const std::vector<int32_t> &strides, T *outputData)
+{
+ ARM_COMPUTE_ERROR_ON(startIndices.size() != static_cast<size_t>(maxDims));
+ ARM_COMPUTE_ERROR_ON(stopIndices.size() != static_cast<size_t>(maxDims));
+ ARM_COMPUTE_ERROR_ON(strides.size() != static_cast<size_t>(maxDims));
+
+ const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
+ const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
+ const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
+ const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
+ const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
+ const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
+ const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
+ const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
+
+ // The output shape may collapse along a sliced dimension, so build a shape
+ // that matches the actual layout of the sliced result.
+ TensorShape outputShape(
+ getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
+ getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
+ for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
+ in_b += strides[3], b++)
+ {
+ for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
+ in_d += strides[2], d++)
+ {
+ for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
+ in_h += strides[1], h++)
+ {
+ for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
+ in_w += strides[0], w++)
+ {
+ outputData[offset4D(outputShape, b, d, h, w)] =
+ inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
+ }
+ }
+ }
+ }
+}
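+
+// Usage sketch (hypothetical values): copy row 0 of a 2x3 float matrix. The
+// parameters are padded to 4-D the same way run_on_cpu() below pads them
+// (unused dimensions get start = 0, stop = 1, stride = 1):
+//   TensorShape in_shape(3, 2, 1, 1); // W = 3, H = 2, D = 1, B = 1
+//   std::vector<int32_t> starts{0, 0, 0, 0}, stops{3, 1, 1, 1}, strides{1, 1, 1, 1};
+//   StridedSlice<float>(in, in_shape, /*beginMask=*/0, /*endMask=*/0,
+//                       starts, stops, strides, out); // out = {in[0], in[1], in[2]}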
+
+void CLStridedSliceCPU::run_on_cpu()
+{
+ // TODO: Support shrinkAxisMask
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ _input->map(q);
+ _output->map(q);
+ _beginData->map(q);
+ _endData->map(q);
+ _stridesData->map(q);
+
+ TensorShape inputShape = _input->info()->tensor_shape();
+ TensorShape outputShape = _output->info()->tensor_shape();
+
+ std::vector<int32_t> starts;
+ std::vector<int32_t> stops;
+ std::vector<int32_t> strides;
+
+ for (uint32_t idx = 0; idx < _input->info()->num_dimensions(); ++idx)
+ {
+ starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
+ stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
+ strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
+ }
+
+ for (uint32_t i = _input->info()->num_dimensions(); i < static_cast<uint32_t>(maxDims); i++)
+ {
+ starts.emplace_back(0);
+ stops.emplace_back(1);
+ strides.emplace_back(1);
+ }
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::QASYMM8:
+ StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint8_t *>(_output->buffer()));
+ break;
+ case DataType::S8:
+ case DataType::QS8:
+ StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
+ break;
+ case DataType::U16:
+ StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint16_t *>(_output->buffer()));
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<int16_t *>(_output->buffer()));
+ break;
+ case DataType::F16:
+ // Note: the F16 path is untested.
+ StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
+ break;
+ case DataType::U32:
+ StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<uint32_t *>(_output->buffer()));
+ break;
+ case DataType::S32:
+ StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides,
+ reinterpret_cast<int32_t *>(_output->buffer()));
+ break;
+ case DataType::F32:
+ StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
+ _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+
+ _input->unmap(q);
+ _output->unmap(q);
+ _beginData->unmap(q);
+ _endData->unmap(q);
+ _stridesData->unmap(q);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..6426364c9
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include <vector>
+#include <algorithm>
+#include <cstdlib>   // getenv
+#include <stdexcept> // std::runtime_error
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr), _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+ // _total_bits must be divisible by _bits.
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+ // _n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+ {
+ // DO NOTHING for CPU.
+ }
+}
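+
+// Backend-selection sketch: the path taken by configure()/run() is chosen from
+// the ACL_TOPKV2 environment variable, set before configure() is called, e.g.:
+//   setenv("ACL_TOPKV2", "GPU", 1);        // multi-pass radix sort on the GPU
+//   setenv("ACL_TOPKV2", "GPU_SINGLE", 1); // single-threaded quicksort kernel
+//   unsetenv("ACL_TOPKV2");                // default: CPU fallback (run_on_cpu)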
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+ {
+ run_on_cpu();
+ }
+}
+
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // 1. CLTopKV2Init sets the key buffer and the index buffer.
+ // - The key buffer is filled with the layer's input values.
+ // - The index buffer is filled with the corresponding indices.
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+ // 2. Repeat (total_bits / bits) times, e.g., eight passes for 32-bit keys
+ //    sorted with 4-bit digits.
+ // - total_bits is the bit width of the data type (e.g., 32 for float)
+ // - bits defines the digit size, giving 2^bits buckets (e.g., 16 buckets when bits is 4)
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate local prefix sums of the histogram with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate the prefix sum of the per-work-group sums
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Paste the scanned group sums back to complete the global prefix sum
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+ // 3. Get the index of the first negative key.
+ // Because the in/out buffers are swapped at the end of each pass above,
+ // the final sorted data is now in the "in" buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+ // 4. Correct the ordering of negatives.
+ // - Radix sort on the raw bit pattern ignores the sign, so negative keys end
+ //   up ordered after (as if larger than) the positive ones.
+ // - The reordered data is stored in _p_out_key_buf and _p_out_ind_buf.
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
+#if 0
+ // The code below is left in place for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+ if (rank > 2)
+ throw std::runtime_error("Rank > 2 is not supported.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Not supported type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h
new file mode 100644
index 000000000..a18ff0b0d
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/topk_v2.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+#include <algorithm> // std::sort, std::make_heap, std::min, ...
+#include <cstdint>   // int32_t
+#include <vector>
+
+typedef int32_t int32;
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+// The following code is implemented, with modifications, while referring to the
+// TFLite topk_v2.cc file.
+// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, and
+// TENSOR_INT32; TFLite additionally supports kTfLiteInt64.
+
+// A class that collects the indexes of the top k values. Based on the template
+// tensorflow::gtl::TopN<>, but re-uses the same container as an optimization.
+template <typename T> class TopContainer
+{
+public:
+ TopContainer() = delete;
+ TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+ {
+ container_.reserve(std::min(k, row_size) + 1);
+ }
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ TopContainer(const TopContainer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ TopContainer &operator=(const TopContainer &) = delete;
+
+ void start_collecting(const T *values)
+ {
+ values_ = values;
+ container_.clear();
+ }
+
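+ // Invariant kept by push()/sorted_result(): container_ holds at most k_ + 1
+ // candidate indexes. Once more than k_ have been seen, the first k_ entries
+ // form a heap whose front is the current k-th best value and the last slot
+ // holds the most recently evicted candidate, so a new index is admitted only
+ // if its value beats the heap front.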
+ void push(int32 a)
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)k_)
+ {
+ container_.push_back(a);
+ if (container_.size() == (size_t)(k_ + 1))
+ {
+ std::make_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+ else if (comparator(a, container_.front()))
+ {
+ container_.back() = a;
+ std::push_heap(container_.begin(), container_.end(), comparator);
+ std::pop_heap(container_.begin(), container_.end(), comparator);
+ }
+ }
+
+ const std::vector<int32> &sorted_result()
+ {
+ auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+ if (container_.size() <= (size_t)(k_))
+ {
+ std::sort(container_.begin(), container_.end(), comparator);
+ }
+ else
+ {
+ std::sort_heap(container_.begin(), container_.end() - 1, comparator);
+ container_.resize(k_);
+ }
+ return container_;
+ }
+
+private:
+ int32 k_;
+ std::vector<int32> container_;
+ const T *values_ = nullptr;
+
+ bool compare_fun(int32 a, int32 b) const
+ {
+ if (values_[b] < values_[a])
+ {
+ return true;
+ }
+ else if (values_[b] > values_[a])
+ {
+ return false;
+ }
+ else
+ {
+ return a < b;
+ }
+ }
+};
+
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+ T *output_values)
+{
+ TopContainer<T> topc(k, row_size);
+ for (int row = 0; row < num_rows; ++row)
+ {
+ const T *values_row = data + row * row_size;
+ topc.start_collecting(values_row);
+ for (int32 c = 0; c < row_size; ++c)
+ {
+ topc.push(c);
+ }
+
+ // Prepare output buffers.
+ int32 *indexes_row = output_indexes + row * k;
+ T *output_row = output_values + row * k;
+ // We always assume that the output is sorted.
+ const auto &top_k = topc.sorted_result();
+ std::copy(top_k.begin(), top_k.end(), indexes_row);
+ std::transform(top_k.begin(), top_k.end(), output_row,
+ [values_row](const int32 loc) { return values_row[loc]; });
+ }
+}
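+
+// Usage sketch (hypothetical data): per-row top-2 of a 2x4 matrix.
+//   const float data[] = {1.f, 9.f, 3.f, 7.f,  // row 0 -> values {9, 7}, indexes {1, 3}
+//                         4.f, 4.f, 8.f, 2.f}; // row 1 -> values {8, 4}, indexes {2, 0}
+//   int32 indexes[4];
+//   float values[4];
+//   TopK<float>(/*row_size=*/4, /*num_rows=*/2, data, /*k=*/2, indexes, values);
+// Ties are broken toward the smaller index (see compare_fun above).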
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt
index 8865a92be..687159725 100644
--- a/libs/CMakeLists.txt
+++ b/libs/CMakeLists.txt
@@ -1,5 +1,3 @@
add_subdirectory(util)
-if(BUILD_NN_RUNTIME)
- add_subdirectory(kernel)
-endif(BUILD_NN_RUNTIME)
add_subdirectory(support)
+add_subdirectory(ARMComputeEx)
diff --git a/libs/kernel/CMakeLists.txt b/libs/kernel/CMakeLists.txt
deleted file mode 100644
index 7da54604d..000000000
--- a/libs/kernel/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-if(${TARGET_ARCH_BASE} STREQUAL "arm" OR ${TARGET_ARCH_BASE} STREQUAL "aarch64")
- add_subdirectory(acl)
-endif()
diff --git a/libs/kernel/acl/CMakeLists.txt b/libs/kernel/acl/CMakeLists.txt
deleted file mode 100644
index 8f0486e56..000000000
--- a/libs/kernel/acl/CMakeLists.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-set(LIB_KERNELACL kernelacl)
-set(LIB_KERNELACL_TEST kernelacl_test)
-
-# TODO remove this when default goes to c++14
-if(CMAKE_VERSION VERSION_LESS 3.1.0)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
-else(CMAKE_VERSION VERSION_LESS 3.1.0)
- set(CMAKE_CXX_STANDARD 14)
-endif(CMAKE_VERSION VERSION_LESS 3.1.0)
-
-# runtime information
-set(PATH_RUNTIME_NN ${CMAKE_SOURCE_DIR}/runtimes/nn)
-SET(RUNTIME_INCLUDES ${PATH_RUNTIME_NN}/common/include
- ${PATH_RUNTIME_NN}/runtime/include
- ${PATH_RUNTIME_NN}/depend/hal/include
- ${PATH_RUNTIME_NN}/depend/libhidl/base/include
- ${PATH_RUNTIME_NN}/depend/libcutils/include
- ${PATH_RUNTIME_NN}/depend/libutils/include
- ${PATH_RUNTIME_NN}/depend/android-base/include
- )
-
-# common
-link_directories(${CMAKE_INSTALL_PREFIX}/lib)
-
-# kernel library
-set(KERNELACL_SRCS "src/Init_acl.cpp"
- "src/IO_accessor.cpp"
- "src/shape.cpp"
- "src/support.cpp"
- "src/cl/Conv2D.cpp"
- "src/cl/DepthwiseConv2D.cpp"
- "src/cl/FullyConnected.cpp"
- "src/cl/Pooling.cpp"
- "src/cl/Reshape.cpp"
- "src/cl/Softmax.cpp"
- "src/cl/Concatenation.cpp"
- "src/neon/Conv2D.cpp"
- "src/neon/DepthwiseConv2D.cpp"
- "src/neon/FullyConnected.cpp"
- "src/neon/Pooling.cpp"
- "src/neon/Softmax.cpp"
- "src/neon/Reshape.cpp"
- "src/neon/Concatenation.cpp"
- )
-
-add_library(${LIB_KERNELACL} SHARED ${KERNELACL_SRCS})
-target_include_directories(${LIB_KERNELACL} PUBLIC
- ${NNFW_INCLUDE_DIR}
- ${RUNTIME_INCLUDES}
- ${NNFW_ACL_INCLUDES}
- ${CMAKE_SOURCE_DIR}/include
- )
-target_link_libraries(${LIB_KERNELACL} nnfw_support_nnapi)
-if (${TARGET_OS} STREQUAL "tizen")
- target_link_libraries(${LIB_KERNELACL} nnfw_util ${NNFW_ACL_LIBS} OpenCL)
-else()
- target_link_libraries(${LIB_KERNELACL} nnfw_util ${NNFW_ACL_LIBS})
-endif()
-install(TARGETS ${LIB_KERNELACL} DESTINATION lib)
-
-# kernel test executable
-set(KERNELACL_TEST_SRCS "src/util.cpp"
- "src/gtest_env.cpp"
- "src/cl/Conv2D.test.cpp"
- "src/cl/DepthwiseConv2D.test.cpp"
- "src/cl/FullyConnected.test.cpp"
- "src/cl/Pooling.test.cpp"
- "src/cl/Reshape.test.cpp"
- "src/cl/Softmax.test.cpp"
- "src/cl/Concatenation.test.cpp"
- "src/neon/Conv2D.test.cpp"
- "src/neon/DepthwiseConv2D.test.cpp"
- "src/neon/FullyConnected.test.cpp"
- "src/neon/Pooling.test.cpp"
- "src/neon/Softmax.test.cpp"
- "src/neon/Reshape.test.cpp"
- "src/neon/Concatenation.test.cpp"
- )
-
-add_executable(${LIB_KERNELACL_TEST} ${KERNELACL_TEST_SRCS})
-target_include_directories(${LIB_KERNELACL_TEST} PUBLIC
- ${NNFW_INCLUDE_DIR}
- ${RUNTIME_INCLUDES}
- ${NNFW_ACL_INCLUDES}
- )
-if (NOT ${TARGET_OS} STREQUAL "tizen")
- add_dependencies(${LIB_KERNELACL_TEST} googletest)
-endif()
-target_link_libraries(${LIB_KERNELACL_TEST}
- ${LIB_KERNELACL}
- nnfw_util ${NNFW_ACL_LIBS}
- ${NNFW_GTEST_LIBS}
- )
-install(TARGETS ${LIB_KERNELACL_TEST} DESTINATION unittest)
diff --git a/libs/kernel/acl/src/CLUniqueTensor.h b/libs/kernel/acl/src/CLUniqueTensor.h
deleted file mode 100644
index 6844e4565..000000000
--- a/libs/kernel/acl/src/CLUniqueTensor.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
-#define __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
-
-#include <arm_compute/runtime/CL/CLTensor.h>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-class CLUniqueTensor
-{
-public:
- CLUniqueTensor(const ::arm_compute::TensorInfo &info)
- {
- _tensor.allocator()->init(info);
- }
-
-public:
- // Both copy and move are not allowed
- CLUniqueTensor(const CLUniqueTensor &) = delete;
- CLUniqueTensor(CLUniqueTensor &&) = delete;
-
-public:
- ~CLUniqueTensor()
- {
- _tensor.allocator()->free();
- }
-
-public:
- void allocate()
- {
- _tensor.allocator()->allocate();
- }
-
-public:
- ::arm_compute::CLTensor &ref(void) { return _tensor; }
- ::arm_compute::CLTensor *ptr(void) { return &_tensor; }
-
-private:
- ::arm_compute::CLTensor _tensor;
-};
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif //__NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
diff --git a/libs/kernel/acl/src/DepthwiseConv2D.h b/libs/kernel/acl/src/DepthwiseConv2D.h
deleted file mode 100644
index 8af8d4fd0..000000000
--- a/libs/kernel/acl/src/DepthwiseConv2D.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
-#define __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/runtime/IFunction.h>
-
-#include "shape.h"
-#include "IO_accessor.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace common {
-
-typedef std::function<void (void)> sync_scheduler_f;
-
-template<class TensorT, class LayerT, class ActT>
-bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* filterData, const nnfw::rt::Shape& filterShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t depth_multiplier, int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape,
- sync_scheduler_f sync_scheduler) {
- auto inputShapeACL = util::fromNNShape(inputShape);
- auto weightsShapeACL = util::fromNNShape(filterShape);
- auto biasShapeACL = util::fromNNShape(biasShape);
- auto outputShapeACL = util::fromNNShape(outputShape);
-
- TensorT input(arm_compute::TensorInfo(inputShapeACL, arm_compute::Format::F32));
- TensorT weights(arm_compute::TensorInfo(weightsShapeACL, arm_compute::Format::F32));
- TensorT bias(arm_compute::TensorInfo(biasShapeACL, arm_compute::Format::F32));
- TensorT output(arm_compute::TensorInfo(outputShapeACL, arm_compute::Format::F32));
-
- arm_compute::PadStrideInfo psinfo = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- auto l = std::make_shared<LayerT>();
- l->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr(), psinfo);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- fns.emplace_back(l);
-
- util::insertFusedActivationLayer<TensorT, ActT>(output, activation, fns);
-
- input.allocate();
- output.allocate();
- bias.allocate();
- weights.allocate();
-
- // TODO: Do we need 2D tensor accessor for the input feature?
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
- TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
- TensorAccess<WeightAccessor>(weights.ref(), filterData, filterShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- sync_scheduler();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace common
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
diff --git a/libs/kernel/acl/src/DepthwiseConv2D.test.h b/libs/kernel/acl/src/DepthwiseConv2D.test.h
deleted file mode 100644
index b2c8592ee..000000000
--- a/libs/kernel/acl/src/DepthwiseConv2D.test.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/DepthwiseConv2D.h>
-
-// TODO: fix include path in CMakeFiles
-#include "util.h"
-
-#ifndef ACL_TEST
-#error "ACL_TEST should be defined first!"
-#endif // ACL_TEST
-
-#ifndef ACL_CORE_FUNC_NAME
-#error "ACL_CORE_FUNC_NAME should be defined first!"
-#endif // ACL_CORE_FUNC_NAME
-
-using namespace nnfw::kernel::acl;
-
-ACL_TEST(KernelACL_TC, dwise_conv2d_1) {
- uint32_t input_n = 1;
- uint32_t input_h = 3;
- uint32_t input_w = 3;
- uint32_t input_c = 1;
- uint32_t filter_h = 3;
- uint32_t filter_w = 3;
- uint32_t filter_c = 1;
- uint32_t out_h = 1;
- uint32_t out_w = 1;
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t depth_multiplier = 1;
-
- util::TensorWrapper input({input_n, input_h, input_w, input_c});
- util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
- util::TensorWrapper bias({filter_c});
- util::TensorWrapper output({1, out_h, out_w, filter_c});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- uint32_t N = input_n;
- uint32_t H = input_h;
- uint32_t W = input_w;
- uint32_t C = input_c;
-
- return n*H*W*C + h*W*C + w*C + c;
- });
- weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- uint32_t N = 1;
- uint32_t H = filter_h;
- uint32_t W = filter_w;
- uint32_t C = filter_c;
-
- return n*H*W*C + h*W*C + w*C + c;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weights.ptr<float>(), weights.shape(),
- bias.ptr<float>(), bias.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- depth_multiplier, activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, out_h, out_w, filter_c});
- expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 204.f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, dwise_conv2d_multi_channel) {
- uint32_t input_n = 1;
- uint32_t input_h = 3;
- uint32_t input_w = 3;
- uint32_t input_c = 3;
- uint32_t filter_h = 3;
- uint32_t filter_w = 3;
- uint32_t filter_c = input_c;
- uint32_t out_h = 1;
- uint32_t out_w = 1;
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t depth_multiplier = 1;
-
- util::TensorWrapper input({input_n, input_h, input_w, input_c});
- util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
- util::TensorWrapper bias({filter_c});
- util::TensorWrapper output({1, out_h, out_w, filter_c});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- uint32_t N = input_n;
- uint32_t H = input_h;
- uint32_t W = input_w;
- uint32_t C = input_c;
-
- return n*H*W*C + h*W*C + w*C + c;
- });
- weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- uint32_t N = 1;
- uint32_t H = filter_h;
- uint32_t W = filter_w;
- uint32_t C = filter_c;
-
- return n*H*W*C + h*W*C + w*C + c;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weights.ptr<float>(), weights.shape(),
- bias.ptr<float>(), bias.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- depth_multiplier, activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, out_h, out_w, filter_c});
- expected.initValue({
- 1836.f,
- 2061.f,
- 2304.f
- });
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, dwise_conv2d_inception_1) {
- uint32_t input_n = 1;
- uint32_t input_h = 112;
- uint32_t input_w = 112;
- uint32_t input_c = 32;
- uint32_t filter_h = 3;
- uint32_t filter_w = 3;
- uint32_t filter_c = input_c;
- uint32_t out_h = 112;
- uint32_t out_w = 112;
-
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t depth_multiplier = 1;
-
- util::TensorWrapper input({input_n, input_h, input_w, input_c});
- util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
- util::TensorWrapper bias({filter_c});
- util::TensorWrapper output({1, out_h, out_w, filter_c});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU6);
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return c;
- });
- weights.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return c;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weights.ptr<float>(), weights.shape(),
- bias.ptr<float>(), bias.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- depth_multiplier, activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, out_h, out_w, filter_c});
- expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- float v = 9.f;
- if( h == 0 || h == out_h-1 )
- v -= 3.f;
- if( w == 0 || w == out_w-1 )
- v -= 3.f;
-
- // four corners
- if( (w == 0 && h == 0)
- || (w == 0 && h == out_h-1)
- || (w == out_w-1 && h == 0)
- || (w == out_w-1 && h == out_h-1) )
- v += 1.f;
-
- // Assumption: negative numbers cannot appear because
- // only positive numbers exist in the input and weights.
- float ret = c*c*v;
- return std::min(ret, 6.f);
- });
-
- EXPECT_EQ(output, expected);
-}
diff --git a/libs/kernel/acl/src/FullyConnected.h b/libs/kernel/acl/src/FullyConnected.h
deleted file mode 100644
index 5030a8548..000000000
--- a/libs/kernel/acl/src/FullyConnected.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
-#define __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/runtime/IFunction.h>
-
-#include "shape.h"
-#include "IO_accessor.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace common {
-
-typedef std::function<void (void)> sync_scheduler_f;
-
-template<class TensorT, class LayerT, class ActT>
-bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* weightsData, const nnfw::rt::Shape& weightsShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape,
- sync_scheduler_f sync_scheduler) {
-
- // NNAPI specification: https://developer.android.com/ndk/reference/group___neural_networks.html#ggaabbe492c60331b13038e39d4207940e0aaada7a3dbaf4676aba560c933ff610c5
-
- // According to the NNAPI Specification,
- // INPUT
- // 1. input rank is up to 4.
- // 2. if input rank > 2, it is flattened to rank 2 [batch_size, input_size]
- nnfw::rt::Shape flattenedInputShape = inputShape;
- switch(inputShape.dimensions.size()) {
- case 1:
- {
- assert("Need to be implemented." && 0);
- break;
- }
- case 2:
- {
- // DO NOTHING.
- break;
- }
- case 3:
- {
- assert("Need to be implemented." && 0);
- break;
- }
- case 4:
- {
- auto N = inputShape.dimensions[0];
- auto H = inputShape.dimensions[1];
- auto W = inputShape.dimensions[2];
- auto C = inputShape.dimensions[3];
- flattenedInputShape.dimensions = {N, H*W*C};
- break;
- }
- default:
- assert(inputShape.dimensions.size() <= 4);
- }
- // Finally, flattenedInputShape is a 2D tensor.
-
- // WEIGHTS is a 2D tensor
- assert(weightsShape.dimensions.size() == 2);
-
- // BIAS is a 1D tensor
- assert(biasShape.dimensions.size() == 1);
-
- // OUTPUT is a 2D tensor.
- assert(outputShape.dimensions.size() == 2);
-
- auto input_shape = util::fromNNShape(flattenedInputShape);
- auto weights_shape = util::fromNNShape(weightsShape);
- auto bias_shape = util::fromNNShape(biasShape);
- auto output_shape = util::fromNNShape(outputShape);
-
- assert(activation == ANEURALNETWORKS_FUSED_NONE || activation == ANEURALNETWORKS_FUSED_RELU);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
- TensorT bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
- TensorT weights(arm_compute::TensorInfo(weights_shape, arm_compute::Format::F32));
-
- auto fc = std::make_shared<LayerT>();
- fc->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr());
-
- fns.emplace_back(fc);
-
- if (ANEURALNETWORKS_FUSED_RELU == activation)
- {
- auto relu_f = std::make_shared<ActT>();
-
- const arm_compute::ActivationLayerInfo relu_info{arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
-
- // Do in-place update
- relu_f->configure(output.ptr(), nullptr, relu_info);
-
- fns.emplace_back(relu_f);
- }
-
- input.allocate();
- output.allocate();
- bias.allocate();
- weights.allocate();
-
- // TODO: Do we need 2D tensor accessor for the input feature?
- TensorAccess<MatrixWeightAccessor>(input.ref(), inputData, inputShape);
- TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
- TensorAccess<MatrixWeightAccessor>(weights.ref(), weightsData, weightsShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- sync_scheduler();
-
- TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace common
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
diff --git a/libs/kernel/acl/src/FullyConnected.test.h b/libs/kernel/acl/src/FullyConnected.test.h
deleted file mode 100644
index 01bbff802..000000000
--- a/libs/kernel/acl/src/FullyConnected.test.h
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/FullyConnected.h>
-
-// TODO: fix include path in CMakeFiles
-#include "util.h"
-
-#ifndef ACL_TEST
-#error "ACL_TEST should be defined first!"
-#endif // ACL_TEST
-
-#ifndef ACL_CORE_FUNC_NAME
-#error "ACL_CORE_FUNC_NAME should be defined first!"
-#endif // ACL_CORE_FUNC_NAME
-
-using namespace nnfw::kernel::acl;
-using fullyConnectedFloat32T = bool (*)(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* weightsData, const nnfw::rt::Shape& weightsShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape);
-
-ACL_TEST(KernelACL_TC, fcFloat32_1) {
-
- util::TensorWrapper input({1,1,1,100});
- util::TensorWrapper weights({1,100});
- util::TensorWrapper bias({1});
- util::TensorWrapper output({1,1});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.f;
- });
- weights.initValue([](uint32_t h, uint32_t w) {
- return 1.f;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weights.ptr<float>(), weights.shape(),
- bias.ptr<float>(), bias.shape(),
- activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,1});
- expected.initValue([](uint32_t h, uint32_t w) {
- return 100.f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, fcFloat32_relu) {
-
- util::TensorWrapper input({1,1,1,100});
- util::TensorWrapper weights({1,100});
- util::TensorWrapper bias({1});
- util::TensorWrapper output({1,1});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.f;
- });
- weights.initValue([](uint32_t h, uint32_t w) {
- return -1.f;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weights.ptr<float>(), weights.shape(),
- bias.ptr<float>(), bias.shape(),
- activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,1});
- expected.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, fcFloat32_conv_fc) {
- uint32_t input_n = 1;
- uint32_t input_c = 5;
- uint32_t input_h = 4;
- uint32_t input_w = 4;
- uint32_t weight_n = 6;
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- util::TensorWrapper input({input_n, input_h, input_w, input_c});
- util::TensorWrapper weight({weight_n, input_c*input_h*input_w});
- util::TensorWrapper bias({weight_n});
- util::TensorWrapper output({1, weight_n});
-
- input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- uint32_t N = input_n;
- uint32_t H = input_h;
- uint32_t W = input_w;
- uint32_t C = input_c;
-
- return n*H*W*C + h*W*C + w*C + c;
- });
-
- weight.initValue([&](uint32_t h, uint32_t w) {
- uint32_t H = weight_n;
- uint32_t W = input_c*input_h*input_w;
-
- return h*W + w;
- });
-
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
-
- output.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weight.ptr<float>(), weight.shape(),
- bias.ptr<float>(), bias.shape(),
- activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, weight_n});
- expected.initValue({
- 167480.f,
- 420280.f,
- 673080.f,
- 925880.f,
- 1178680.f,
- 1431480.f});
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, fcFloat32_fc_fc) {
- uint32_t input_n = 6;
- uint32_t weight_n = 6;
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- util::TensorWrapper input({1, input_n});
- util::TensorWrapper weight({weight_n, input_n});
- util::TensorWrapper bias({weight_n});
- util::TensorWrapper output({1, weight_n});
-
- input.initValue([&](uint32_t h, uint32_t w) {
- // not use h because h = 0.
- return (float)w;
- });
-
- weight.initValue([&](uint32_t h, uint32_t w) {
- uint32_t H = weight_n;
- uint32_t W = input_n;
-
- return (float)(h*W + w);
- });
-
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
-
- output.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weight.ptr<float>(), weight.shape(),
- bias.ptr<float>(), bias.shape(),
- activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, weight_n});
- expected.initValue({
- 55.f,
- 145.f,
- 235.f,
- 325.f,
- 415.f,
- 505.f,
- });
-
- EXPECT_EQ(output, expected);
-}
-
-ACL_TEST(KernelACL_TC, fcFloat32_inceptionv3) {
-
- uint32_t input_c = 2048;
- uint32_t weight_n = 1008;
-
- util::TensorWrapper input({1,1,1,input_c});
- util::TensorWrapper weight({weight_n,input_c});
- util::TensorWrapper bias({weight_n});
- util::TensorWrapper output({1, weight_n});
-
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
-
- input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.f;
- });
- weight.initValue([&](uint32_t h, uint32_t w) {
- return (float)h;
- });
- bias.initValue([](uint32_t w) {
- return 0.f;
- });
- output.initValue([](uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
- weight.ptr<float>(), weight.shape(),
- bias.ptr<float>(), bias.shape(),
- activation,
- output.ptr<float>(), output.shape());
-
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1, weight_n});
- expected.initValue([&](uint32_t h, uint32_t w) {
- return w*input_c;
- });
-
- EXPECT_EQ(output, expected);
-}
-
diff --git a/libs/kernel/acl/src/IO_accessor.cpp b/libs/kernel/acl/src/IO_accessor.cpp
deleted file mode 100644
index 410fb8ea5..000000000
--- a/libs/kernel/acl/src/IO_accessor.cpp
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "IO_accessor.h"
-
-#include <cassert>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-InputAccessor::InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
- : _inputData(inputData)
- , _inputShape(inputShape)
-{
-}
-
-MatrixInputAccessor::MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
- : _inputData(inputData)
- , _inputShape(inputShape)
-{
-}
-
-VectorInputAccessor::VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
- : _inputData(inputData)
- , _inputShape(inputShape)
-{
-}
-
-WeightAccessor::WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape)
- : _filterData(filterData)
- , _filterShape(filterShape)
-{
-}
-
-MatrixWeightAccessor::MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape)
- : _filterData(filterData)
- , _filterShape(filterShape)
-{
-}
-
-BiasAccessor::BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape)
- : _biasData(biasData)
- , _biasShape(biasShape)
-{
-}
-
-OutputAccessor::OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
- : _outputData(outputData)
- , _outputShape(outputShape)
-{
-}
-
-MatrixOutputAccessor::MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
- : _outputData(outputData)
- , _outputShape(outputShape)
-{
-}
-
-VectorOutputAccessor::VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
- : _outputData(outputData)
- , _outputShape(outputShape)
-{
-}
-
-static uint32_t getOffsetNCHW(const nnfw::rt::Shape& shape, const arm_compute::Coordinates& id)
-{
- // get offset for ACL(NCHW) from data of NNAPI(NHWC)
- uint32_t num = getSizeOfDimension(shape, 0);
- uint32_t height = getSizeOfDimension(shape, 1);
- uint32_t width = getSizeOfDimension(shape, 2);
- uint32_t chann = getSizeOfDimension(shape, 3);
- uint32_t stride = 1;
- uint32_t offset = 0;
- uint32_t numdim = id.num_dimensions();
- offset += numdim > 0 ? id[0] * stride : 0; stride *= width;
- offset += numdim > 1 ? id[1] * stride : 0; stride *= height;
- offset += numdim > 2 ? id[2] * stride : 0; stride *= chann;
- offset += numdim > 3 ? id[3] * stride : 0; stride *= num;
- return offset;
-}
-
-static uint32_t getElementOffset(const nnfw::rt::Shape& shape,
- uint32_t ch, uint32_t row, uint32_t col)
-{
- assert(getSizeOfDimension(shape, 0) == 1);
- assert(shape.dimensions.size() == 4);
-
- // TODO Optimize this!
- const uint32_t W = getSizeOfDimension(shape, 2);
- const uint32_t C = getSizeOfDimension(shape, 3);
-
-  uint32_t offset = 0;
-
- // NNAPI uses NHWC ordering
- offset += row * W * C;
- offset += col * C;
- offset += ch;
-
- return offset;
-}
-
-static uint32_t getElementOffset(const nnfw::rt::Shape& shape,
- uint32_t nth, uint32_t ch, uint32_t row, uint32_t col)
-{
- assert(shape.dimensions.size() == 4);
-
- // TODO Optimize this!
- const uint32_t H = getSizeOfDimension(shape, 1);
- const uint32_t W = getSizeOfDimension(shape, 2);
- const uint32_t C = getSizeOfDimension(shape, 3);
-
-  uint32_t offset = 0;
-
- // NNAPI uses NHWC ordering
- offset += nth * H * W * C;
- offset += row * W * C;
- offset += col * C;
- offset += ch;
-
- return offset;
-}
-
-bool InputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const uint32_t ch = id[2];
- const uint32_t row = id[1];
- const uint32_t col = id[0];
-
- uint32_t offset = getElementOffset(_inputShape, ch, row, col);
-
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_inputData + offset);
- });
- return true;
-}
-
-bool MatrixInputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- assert(tensor.info()->tensor_shape().num_dimensions() <= 2);
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const auto row = id[1];
- const auto col = id[0];
- const auto W = tensor.info()->tensor_shape().x();
-
- const auto offset = row * W + col;
-
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_inputData + offset);
- });
- return true;
-}
-
-bool VectorInputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- assert(tensor.info()->tensor_shape().num_dimensions() == 1);
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- uint32_t offset = id[0];
-
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_inputData + offset);
- });
- return true;
-}
-
-bool WeightAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const uint32_t nth = id[3];
- const uint32_t ch = id[2];
- const uint32_t row = id[1];
- const uint32_t col = id[0];
-
- uint32_t offset = getElementOffset(_filterShape, nth, ch, row, col);
-
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_filterData + offset);
- });
- return true;
-}
-
-bool MatrixWeightAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- assert(tensor.info()->tensor_shape().num_dimensions() <= 2);
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const auto row = id[1];
- const auto col = id[0];
- const auto W = tensor.info()->tensor_shape().x();
-
- uint32_t offset = row * W + col;
-
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_filterData + offset);
- });
- return true;
-}
-
-bool BiasAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- uint32_t offset = getOffsetNCHW(_biasShape, id);
- *reinterpret_cast<float *>(tensor.ptr_to_element(id)) =
- *(_biasData + offset);
- });
- return true;
-}
-
-bool OutputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const uint32_t ch = id[2];
- const uint32_t row = id[1];
- const uint32_t col = id[0];
-
- uint32_t offset = getElementOffset(_outputShape, ch, row, col);
-
- *(_outputData + offset) =
- *reinterpret_cast<float *>(tensor.ptr_to_element(id));
- });
-  return false; // false signals the end of the network
-}
-
-bool VectorOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- assert(tensor.info()->tensor_shape().num_dimensions() == 1);
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const uint32_t x = id[0];
-
- uint32_t offset = x;
-
- *(_outputData + offset) =
- *reinterpret_cast<float *>(tensor.ptr_to_element(id));
- });
-  return false; // false signals the end of the network
-}
-
-bool MatrixOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
-{
- arm_compute::Window window;
- window.use_tensor_dimensions(tensor.info()->tensor_shape());
-
- assert(tensor.info()->tensor_shape().num_dimensions() <= 2);
-
- execute_window_loop(window, [&](const arm_compute::Coordinates& id)
- {
- const auto row = id[1];
- const auto col = id[0];
- const auto W = tensor.info()->tensor_shape().x();
-
- const auto offset = row * W + col;
-
- *(_outputData + offset) =
- *reinterpret_cast<float *>(tensor.ptr_to_element(id));
- });
-  return false; // false signals the end of the network
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
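For reference, every accessor above reduces to the same NHWC offset arithmetic. A minimal standalone sketch (not part of the deleted file; names and values are illustrative):

#include <cassert>
#include <cstdint>

// Offset of element (nth, row, col, ch) in an NHWC-ordered buffer,
// mirroring getElementOffset() from the file above.
static uint32_t nhwc_offset(uint32_t H, uint32_t W, uint32_t C,
                            uint32_t nth, uint32_t row, uint32_t col, uint32_t ch)
{
  return nth * H * W * C + row * W * C + col * C + ch;
}

int main()
{
  // In a 1x2x3x2 buffer, (n=0, h=1, w=2, c=1) is the last of 12 elements.
  assert(nhwc_offset(2, 3, 2, 0, 1, 2, 1) == 11);
  return 0;
}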
diff --git a/libs/kernel/acl/src/IO_accessor.h b/libs/kernel/acl/src/IO_accessor.h
deleted file mode 100644
index e7670f15c..000000000
--- a/libs/kernel/acl/src/IO_accessor.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
-#define __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
-
-#include <arm_compute/graph/ITensorAccessor.h>
-#include <arm_compute/runtime/CL/CLFunctions.h>
-#include <arm_compute/runtime/NEON/NEFunctions.h>
-
-#include <OperationsUtils.h> // for nnfw::rt::Shape
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-class InputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
- InputAccessor(InputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _inputData;
- const nnfw::rt::Shape& _inputShape;
-};
-
-class MatrixInputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
- MatrixInputAccessor(MatrixInputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _inputData;
- const nnfw::rt::Shape& _inputShape;
-};
-
-class VectorInputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
- VectorInputAccessor(VectorInputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _inputData;
- const nnfw::rt::Shape& _inputShape;
-};
-
-class WeightAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape);
- WeightAccessor(WeightAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _filterData;
- const nnfw::rt::Shape& _filterShape;
-};
-
-class MatrixWeightAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape);
- MatrixWeightAccessor(MatrixWeightAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _filterData;
- const nnfw::rt::Shape& _filterShape;
-};
-
-class BiasAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape);
- BiasAccessor(BiasAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- const float* _biasData;
- const nnfw::rt::Shape& _biasShape;
-};
-
-class OutputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
- OutputAccessor(OutputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- float* _outputData;
- const nnfw::rt::Shape& _outputShape;
-};
-
-class MatrixOutputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
- MatrixOutputAccessor(MatrixOutputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- float* _outputData;
- const nnfw::rt::Shape& _outputShape;
-};
-
-class VectorOutputAccessor : public arm_compute::graph::ITensorAccessor
-{
-public:
- VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
- VectorOutputAccessor(VectorOutputAccessor&&) = default;
-
-  // Inherited methods overridden:
- bool access_tensor(arm_compute::ITensor& tensor) override;
-
-private:
- float* _outputData;
- const nnfw::rt::Shape& _outputShape;
-};
-
-template<typename AccessorType>
-inline void TensorAccess(arm_compute::CLTensor& tensor, const float* data,
- const nnfw::rt::Shape& shape)
-{
- tensor.map();
- AccessorType accessor(data, shape);
- accessor.access_tensor(tensor);
- tensor.unmap();
-}
-
-template<typename AccessorType>
-inline void TensorAccess(arm_compute::CLTensor& tensor, float* data,
- const nnfw::rt::Shape& shape)
-{
- tensor.map();
- AccessorType accessor(data, shape);
- accessor.access_tensor(tensor);
- tensor.unmap();
-}
-
-template<typename AccessorType>
-inline void TensorAccess(arm_compute::Tensor& tensor, const float* data,
- const nnfw::rt::Shape& shape)
-{
- AccessorType accessor(data, shape);
- accessor.access_tensor(tensor);
-}
-
-template<typename AccessorType>
-inline void TensorAccess(arm_compute::Tensor& tensor, float* data,
- const nnfw::rt::Shape& shape)
-{
- AccessorType accessor(data, shape);
- accessor.access_tensor(tensor);
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
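For reference, the TensorAccess helpers above were the glue between host buffers and ACL tensors. A typical call pattern, sketched under the assumption of an initialized CLTensor (clShape, hostIn, hostOut and nnShape are placeholders):

// Assumes `using namespace nnfw::kernel::acl;` as in the deleted sources.
arm_compute::CLTensor t;
t.allocator()->init(arm_compute::TensorInfo(clShape, arm_compute::Format::F32));
t.allocator()->allocate();
// map -> fill via InputAccessor -> unmap
TensorAccess<InputAccessor>(t, hostIn, nnShape);
// ... configure and run layers, then read results back the same way:
TensorAccess<OutputAccessor>(t, hostOut, nnShape);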
diff --git a/libs/kernel/acl/src/NEUniqueTensor.h b/libs/kernel/acl/src/NEUniqueTensor.h
deleted file mode 100644
index 34412f9e3..000000000
--- a/libs/kernel/acl/src/NEUniqueTensor.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
-#define __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
-
-#include <arm_compute/runtime/Tensor.h>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-// TODO: find a way to merge CLUniqueTensor and NEUniqueTensor.
-class NEUniqueTensor
-{
-public:
- NEUniqueTensor(const ::arm_compute::TensorInfo &info)
- {
- _tensor.allocator()->init(info);
- }
-
-public:
-  // Neither copying nor moving is allowed
- NEUniqueTensor(const NEUniqueTensor &) = delete;
- NEUniqueTensor(NEUniqueTensor &&) = delete;
-
-public:
- ~NEUniqueTensor()
- {
- _tensor.allocator()->free();
- }
-
-public:
- void allocate()
- {
- _tensor.allocator()->allocate();
- }
-
-public:
- ::arm_compute::Tensor &ref(void) { return _tensor; }
- ::arm_compute::Tensor *ptr(void) { return &_tensor; }
-
-private:
- ::arm_compute::Tensor _tensor;
-};
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif //__NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
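NEUniqueTensor ties the allocator's init/free to object lifetime. A usage sketch (the layer and shape are placeholders, not part of the deleted file):

{
  arm_compute::TensorShape shape(3U, 3U); // placeholder 3x3
  nnfw::kernel::acl::NEUniqueTensor t(
      arm_compute::TensorInfo(shape, arm_compute::Format::F32));
  // configure layers against t.ptr() here, then:
  t.allocate(); // backing memory acquired
  // ... fill, run, read back ...
} // ~NEUniqueTensor() calls allocator()->free() automatically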
diff --git a/libs/kernel/acl/src/Reshape.h b/libs/kernel/acl/src/Reshape.h
deleted file mode 100644
index ebd82477d..000000000
--- a/libs/kernel/acl/src/Reshape.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
-#define __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-// TODO: fix include path in CMakeFiles
-#include "IO_accessor.h"
-#include "shape.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace common {
-
-using sync_scheduler_f = std::function<void(void)>;
-
-template<class TensorT, class LayerT>
-bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
- void* outputData, const nnfw::rt::Shape& outputShape,
- sync_scheduler_f sync_scheduler) {
-
- auto input_shape = util::fromNNShape(inputShape);
- auto output_shape = util::fromNNShape(outputShape);
-
- TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- LayerT l;
-
- l.configure(input.ptr(), output.ptr());
-
- input.allocate();
- output.allocate();
-
-  TensorAccess<InputAccessor>(input.ref(), static_cast<const float*>(inputData), inputShape);
-
- l.run();
-
- sync_scheduler();
-
-  TensorAccess<OutputAccessor>(output.ref(), static_cast<float*>(outputData), outputShape);
-
- return true;
-}
-
-} // namespace common
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
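Each backend instantiates common::reshapeGeneric with its own tensor/layer pair; the CL instantiation appears further down in this patch. A NEON counterpart would plausibly look like this (sketch only, assuming NEUniqueTensor pairs with arm_compute::NEReshapeLayer the same way):

static void sync_scheduler() {
  // nothing to flush for NEON in this sketch (assumption)
}

bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
                    void* outputData, const nnfw::rt::Shape& outputShape) {
  return common::reshapeGeneric<NEUniqueTensor, arm_compute::NEReshapeLayer>
      (inputData, inputShape, outputData, outputShape, sync_scheduler);
}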
diff --git a/libs/kernel/acl/src/Reshape.test.h b/libs/kernel/acl/src/Reshape.test.h
deleted file mode 100644
index a96a896a6..000000000
--- a/libs/kernel/acl/src/Reshape.test.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/Reshape.h>
-
-// TODO: fix include path in CMakeFiles
-#include "util.h"
-
-#ifndef ACL_TEST
-#error "ACL_TEST should be defined first!"
-#endif // ACL_TEST
-
-#ifndef ACL_CORE_FUNC_NAME
-#error "ACL_CORE_FUNC_NAME should be defined first!"
-#endif // ACL_CORE_FUNC_NAME
-
-using namespace nnfw::kernel::acl;
-
-ACL_TEST(KernelACL_TC, reshape_1) {
-  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,9,1}, 1.0, 0 };
- float inputData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
-
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float outputData[9] = {0};
-
- bool bret = ACL_CORE_FUNC_NAME(inputData, inputShape,
- outputData, outputShape);
-
- EXPECT_EQ(bret, true);
-
- float expectData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/cl/Concatenation.cpp b/libs/kernel/acl/src/cl/Concatenation.cpp
deleted file mode 100644
index 9376006ca..000000000
--- a/libs/kernel/acl/src/cl/Concatenation.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
- const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- if (axis != 3)
- {
- assert("Only support axis=3 for ACL" && 0);
- return false;
- }
- assert(inputDataPtrs.size() == inputShapes.size());
-
- std::vector<arm_compute::CLTensor*> inputPtrs;
- std::vector<arm_compute::ICLTensor*> inputIptrs;
- arm_compute::CLTensor output;
-
- // init Tensors
- std::vector<nnfw::rt::Shape>::const_iterator it_inputShape = inputShapes.begin();
- for (auto inputData : inputDataPtrs)
- {
- const nnfw::rt::Shape& inputShape = *it_inputShape;
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::CLTensor* inputPtr = new arm_compute::CLTensor();
-
- inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- inputPtrs.push_back(inputPtr);
- inputIptrs.push_back(inputPtr);
-
- it_inputShape++;
- }
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
- output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- // prepare ACL Concatenate and configure tensors
- auto concat = std::make_shared<arm_compute::CLDepthConcatenateLayer>();
- concat->configure(inputIptrs, &output);
-
- // allocate Tensors
- it_inputShape = inputShapes.begin();
- std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin();
- for (auto inputPtr : inputPtrs)
- {
- inputPtr->allocator()->allocate();
-
- const float* inputData = *it_inputData;
- const nnfw::rt::Shape& inputShape = *it_inputShape;
-
- TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape);
-
- it_inputShape++;
- it_inputData++;
- }
- output.allocator()->allocate();
-
- // run
- concat->run();
- arm_compute::CLScheduler::get().sync();
-
- // get output
- TensorAccess<OutputAccessor>(output, outputData, outputShape);
-
- // cleanup
- for (auto inputPtr : inputPtrs)
- {
- inputPtr->allocator()->free();
- delete inputPtr;
- }
- output.allocator()->free();
-
- return true;
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
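The manual new/delete bookkeeping in concatenationFloat32 works but is fragile as the function grows. A std::unique_ptr sketch of the same setup (hypothetical refactor, behavior unchanged):

#include <memory> // std::unique_ptr / std::make_unique (C++14)
#include <vector>

std::vector<std::unique_ptr<arm_compute::CLTensor>> inputs;
std::vector<arm_compute::ICLTensor*> inputIptrs;
for (const auto& inputShape : inputShapes)
{
  auto t = std::make_unique<arm_compute::CLTensor>();
  t->allocator()->init(
      arm_compute::TensorInfo(util::fromNNShape(inputShape), arm_compute::Format::F32));
  inputIptrs.push_back(t.get());
  inputs.push_back(std::move(t));
}
// ... configure/run as above; tensors are destroyed automatically when
// `inputs` goes out of scope, so the explicit cleanup loop disappears.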
diff --git a/libs/kernel/acl/src/cl/Concatenation.test.cpp b/libs/kernel/acl/src/cl/Concatenation.test.cpp
deleted file mode 100644
index b2c5a5891..000000000
--- a/libs/kernel/acl/src/cl/Concatenation.test.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/Concatenation.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, concatFloat32_1)
-{
- float inputData_1[6] = {
- 1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
- };
- float inputData_2[6] = {
- 7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
- };
- const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
- const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
- std::vector<const float*> inputDataPtrs;
- std::vector<nnfw::rt::Shape> inputShapes;
- float outputData[12];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
- bool bret;
-
- inputDataPtrs.push_back(inputData_1);
- inputDataPtrs.push_back(inputData_2);
- inputShapes.push_back(inputShape_1);
- inputShapes.push_back(inputShape_2);
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = concatenationFloat32(inputDataPtrs, inputShapes, 3,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectNCHW[] = {
- 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 11, 12
- };
- float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
- util::NCHW2NHWC(expectNCHW, expectData, outputShape);
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/cl/Conv2D.cpp b/libs/kernel/acl/src/cl/Conv2D.cpp
deleted file mode 100644
index 4783bdc1d..000000000
--- a/libs/kernel/acl/src/cl/Conv2D.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <NeuralNetworks.h>
-
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <util/environment.h>
-
-#include "../IO_accessor.h"
-#include "../util.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-#include "../support.h"
-
-#include "util/feature/TextFormatter.h"
-
-#include "support/nnapi/feature/Reader.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-static int verbose = 0;
-
-bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* filterData, const nnfw::rt::Shape& filterShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape);
- arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
- arm_compute::PadStrideInfo conv_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
- CLUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
- CLUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32));
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- auto conv_f = std::make_shared<arm_compute::CLConvolutionLayer>();
-
- conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info);
-
- fns.emplace_back(conv_f);
-
- util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
-
- input.allocate();
- output.allocate();
- bias.allocate();
- filter.allocate();
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
- TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
- TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape);
-
- nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose);
- if (verbose)
- {
- input.ref().map();
- auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape);
- nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData};
- nnfw::support::acl::feature::Reader<float> acl_ifm_reader{input.ptr()};
-
- std::cout << "NNAPI IFM:" << std::endl;
- std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl;
-
- std::cout << "ARM Compute IFM:" << std::endl;
- std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl;
- input.ref().unmap();
- }
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- arm_compute::CLScheduler::get().sync();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
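With arm_compute::DimensionRoundingType::FLOOR, each spatial output size follows the standard convolution formula. A small checker consistent with the tests below (the helper name is ours, not ACL's):

#include <cstdint>

// Floor-rounded output size per spatial dimension; e.g. the padded 3x3
// cases below give (3 + 1 + 1 - 3) / 1 + 1 == 3.
static int32_t conv_out_size(int32_t in, int32_t pad_lo, int32_t pad_hi,
                             int32_t kernel, int32_t stride)
{
  return (in + pad_lo + pad_hi - kernel) / stride + 1;
}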
diff --git a/libs/kernel/acl/src/cl/Conv2D.test.cpp b/libs/kernel/acl/src/cl/Conv2D.test.cpp
deleted file mode 100644
index e34cdeea5..000000000
--- a/libs/kernel/acl/src/cl/Conv2D.test.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/Conv2D.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, convFloat32_3x3to1x1)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 10.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, convFloat32_3x3to3x3)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[9];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 5.0f, 7.0f, 5.0f,
- 7.0f, 10.0f, 7.0f,
- 5.0f, 7.0f, 5.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, convFloat32_3x3to3x3_RELU)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { -5.0f };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[9];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] =
- {
- 0.0f, 1.0f, 0.0f,
- 1.0f, 4.0f, 1.0f,
- 0.0f, 1.0f, 0.0f
- };
-
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, convFloat32_3x5to3x3)
-{
- float inputData[15] = {
- 1,2,3,4,5,
- 6,7,8,9,10,
- 11,12,13,14,15
- };
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 };
- float filterData[18] = {
- 1,1,1, 1,1,1, 1,1,1,
- 2,2,2, 2,2,2, 2,2,2
- };
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 };
- float biasData[2] = { 1.0, 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[30];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 };
- bool bret;
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectNCHW[] = {
- 17.0f, 28.0f, 34.0f, 40.0f, 29.0f,
- 40.0f, 64.0f, 73.0f, 82.0f, 58.0f,
- 37.0f, 58.0f, 64.0f, 70.0f, 49.0f,
-
- 33.0f, 55.0f, 67.0f, 79.0f, 57.0f,
- 79.0f, 127.0f, 145.0f, 163.0f, 115.0f,
- 73.0f, 115.0f, 127.0f, 139.0f, 97.0f
- };
- float expectData[30];
- util::NCHW2NHWC(expectNCHW, expectData, outputShape);
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
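For reference, the expected values in these tests follow directly from the all-ones data: in convFloat32_3x3to1x1 each output element is 9 * (1 * 1) + 1 (bias) = 10.0f, and the RELU variant simply clamps the biased sums (4, 1, -1) at zero.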
diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp
deleted file mode 100644
index 7593a99f4..000000000
--- a/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-#include "../DepthwiseConv2D.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-static void sync_scheduler() {
- arm_compute::CLScheduler::get().sync();
-}
-
-bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* filterData, const nnfw::rt::Shape& filterShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t depth_multiplier, int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape) {
- return common::depthwiseConvFloat32<CLUniqueTensor, arm_compute::CLDepthwiseConvolutionLayer,
- arm_compute::CLActivationLayer>(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- depth_multiplier, activation,
- outputData, outputShape,
- sync_scheduler);
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp
deleted file mode 100644
index 695563383..000000000
--- a/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ACL_CORE_FUNC_NAME depthwiseConvFloat32
-#define ACL_TEST(tc, t) TEST(tc, cl_##t)
-
-#include "../DepthwiseConv2D.test.h"
diff --git a/libs/kernel/acl/src/cl/FullyConnected.cpp b/libs/kernel/acl/src/cl/FullyConnected.cpp
deleted file mode 100644
index 7513355ab..000000000
--- a/libs/kernel/acl/src/cl/FullyConnected.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-#include "../FullyConnected.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-static void sync_scheduler() {
- arm_compute::CLScheduler::get().sync();
-}
-
-bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* weightsData, const nnfw::rt::Shape& weightsShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape) {
- return common::fullyConnectedFloat32<CLUniqueTensor, arm_compute::CLFullyConnectedLayer,
- arm_compute::CLActivationLayer>(inputData, inputShape,
- weightsData, weightsShape,
- biasData, biasShape,
- activation,
- outputData, outputShape,
- sync_scheduler);
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/FullyConnected.test.cpp b/libs/kernel/acl/src/cl/FullyConnected.test.cpp
deleted file mode 100644
index b1f5a095f..000000000
--- a/libs/kernel/acl/src/cl/FullyConnected.test.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ACL_CORE_FUNC_NAME fullyConnectedFloat32
-#define ACL_TEST(tc, t) TEST(tc, cl_##t)
-
-#include "../FullyConnected.test.h"
diff --git a/libs/kernel/acl/src/cl/Pooling.cpp b/libs/kernel/acl/src/cl/Pooling.cpp
deleted file mode 100644
index e22eacccc..000000000
--- a/libs/kernel/acl/src/cl/Pooling.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-
-#include <cassert>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t filter_width, int32_t filter_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- arm_compute::PoolingLayerInfo maxpool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX,
- arm_compute::Size2D(filter_width,filter_height),
-                                                                 pad_info, false /* exclude_padding */);
-
- CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>();
- pool_f->configure(input.ptr(), output.ptr(), maxpool_info);
-
- fns.emplace_back(pool_f);
-
- input.allocate();
- output.allocate();
-
- util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- arm_compute::CLScheduler::get().sync();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-bool averagePoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t filter_width, int32_t filter_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- arm_compute::PoolingLayerInfo pool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG,
- arm_compute::Size2D(filter_width,filter_height),
-                                                              pad_info, true /* exclude_padding */);
-
- CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>();
- pool_f->configure(input.ptr(), output.ptr(), pool_info);
-
- fns.emplace_back(pool_f);
-
- input.allocate();
- output.allocate();
-
- util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- arm_compute::CLScheduler::get().sync();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
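Note the final boolean passed to PoolingLayerInfo (exclude_padding in ACL): false on the max path, true on the average path, so padded average windows divide by the number of valid elements only; that is exactly what the padded averagePool tests below assume.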
diff --git a/libs/kernel/acl/src/cl/Pooling.test.cpp b/libs/kernel/acl/src/cl/Pooling.test.cpp
deleted file mode 100644
index 8112e7a45..000000000
--- a/libs/kernel/acl/src/cl/Pooling.test.cpp
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <arm_compute/core/Types.h>
-#include <kernel/acl/Pooling.h>
-
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 9.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1_RELU)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = -1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value--;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_RELU;
-
- bret = maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, maxPoolFloat32_3x3to2x2)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 1;
- int32_t padding_top = 0;
- int32_t padding_bottom = 1;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 2;
- int32_t filter_height = 2;
-
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 5.0f, 6.0f,
- 8.0f, 9.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, maxPoolFloat32_147x147to73x73)
-{
- util::TensorWrapper input({1,147,147,64});
- util::TensorWrapper output({1,73,73,64});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = maxPoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,73,73,64});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, maxPoolFloat32_71x71to35x35)
-{
- util::TensorWrapper input({1,71,71,192});
- util::TensorWrapper output({1,35,35,192});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = maxPoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,35,35,192});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 5.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1_RELU)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 3.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value--;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_RELU;
-
- bret = averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_3x3to2x2)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 2;
- int32_t filter_height = 2;
-
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 3.0f, 4.0f,
- 6.0f, 7.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_3x3to3x3)
-{
- std::vector<uint32_t> dims = {1,3,3,1};
- util::TensorWrapper input(dims);
- util::TensorWrapper output(dims);
-
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
-  float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected(dims);
-  float v = 2.5f;
- expected.initValue([&v](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- v = v + 0.5f;
- return v;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_35x35to35x35)
-{
- std::vector<uint32_t> dims = {1,35,35,768};
- util::TensorWrapper input(dims);
- util::TensorWrapper output(dims);
-
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
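-  // The input is constant 1.0 and padded cells are excluded from the
-  // average, so every output cell, including the borders, is exactly 1.0.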
- util::TensorWrapper expected(dims);
-  expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, averagePoolFloat32_8x8to1x1)
-{
- util::TensorWrapper input({1,8,8,2048});
- util::TensorWrapper output({1,1,1,2048});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 8;
- int32_t filter_height = 8;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,1,1,2048});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
diff --git a/libs/kernel/acl/src/cl/Reshape.cpp b/libs/kernel/acl/src/cl/Reshape.cpp
deleted file mode 100644
index e420ab92b..000000000
--- a/libs/kernel/acl/src/cl/Reshape.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-#include "../Reshape.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
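-// The shared reshape template takes a backend-specific sync hook; the CL
-// backend must wait on its command queue before output data is read back.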
-static void sync_scheduler() {
- arm_compute::CLScheduler::get().sync();
-}
-
-bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
- void* outputData, const nnfw::rt::Shape& outputShape) {
- return common::reshapeGeneric<CLUniqueTensor, arm_compute::CLReshapeLayer>
- (inputData, inputShape, outputData, outputShape, sync_scheduler);
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Reshape.test.cpp b/libs/kernel/acl/src/cl/Reshape.test.cpp
deleted file mode 100644
index db23a6d3d..000000000
--- a/libs/kernel/acl/src/cl/Reshape.test.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ACL_CORE_FUNC_NAME reshapeGeneric
-#define ACL_TEST(tc, t) TEST(tc, cl_##t)
-
-#include "../Reshape.test.h"
diff --git a/libs/kernel/acl/src/cl/Softmax.cpp b/libs/kernel/acl/src/cl/Softmax.cpp
deleted file mode 100644
index a628f05fe..000000000
--- a/libs/kernel/acl/src/cl/Softmax.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <NeuralNetworks.h>
-
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../CLUniqueTensor.h"
-#include "../util.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float beta,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto softmax_f = std::make_shared<arm_compute::CLSoftmaxLayer>();
- softmax_f->configure(input.ptr(), output.ptr(), beta);
-
- input.allocate();
- output.allocate();
-
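-  // Pick the accessor matching the tensor rank: 4-D inputs are copied as
-  // feature maps, 2-D (1xN) inputs as matrices.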
- if (inputShape.dimensions.size() == 4)
- {
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- softmax_f->run();
-
- arm_compute::CLScheduler::get().sync();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
- }
- else if (inputShape.dimensions.size() == 2)
- {
- TensorAccess<MatrixInputAccessor>(input.ref(), inputData, inputShape);
-
- softmax_f->run();
-
- arm_compute::CLScheduler::get().sync();
-
- TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape);
- }
- else
- {
- assert("undefined dimension of input" && 0);
- return false;
- }
-
- return true;
-}
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Softmax.test.cpp b/libs/kernel/acl/src/cl/Softmax.test.cpp
deleted file mode 100644
index 8ee8b41e2..000000000
--- a/libs/kernel/acl/src/cl/Softmax.test.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <arm_compute/core/Types.h>
-#include <kernel/acl/Softmax.h>
-
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, softmaxFloat32_1xn)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, softmaxFloat32_4d)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, softmaxFloat32_1xn_seq)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
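-  // softmax(x_i) = exp(x_i) / sum_j exp(x_j); for inputs 1..4 this gives
-  // exp(1..4) / 84.791, i.e. the four values below.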
- float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, softmaxFloat32_4d_seq)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/gtest_env.cpp b/libs/kernel/acl/src/gtest_env.cpp
deleted file mode 100644
index f6fc52f7a..000000000
--- a/libs/kernel/acl/src/gtest_env.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-
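-// gtest invokes SetUp() on registered global environments once before any
-// test runs, so ACL is initialized exactly once per test binary.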
-class TestEnvironment : public ::testing::Environment
-{
-public:
- virtual ~TestEnvironment() = default;
-
- virtual void SetUp()
- {
- nnfw::kernel::acl::Initialize();
- }
-
- virtual void TearDown()
- {
- // DO NOTHING
- }
-};
-
-static ::testing::Environment* const testingenv =
- ::testing::AddGlobalTestEnvironment(new TestEnvironment);
diff --git a/libs/kernel/acl/src/neon/Concatenation.cpp b/libs/kernel/acl/src/neon/Concatenation.cpp
deleted file mode 100644
index 8738a9d12..000000000
--- a/libs/kernel/acl/src/neon/Concatenation.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace neon {
-
-bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
- const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- if (axis != 3)
- {
- assert("Only support axis=3 for ACL" && 0);
- return false;
- }
- assert(inputDataPtrs.size() == inputShapes.size());
-
- std::vector<arm_compute::Tensor*> inputPtrs;
- std::vector<arm_compute::ITensor*> inputIptrs;
- arm_compute::Tensor output;
-
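-  // Input tensors are heap-allocated and freed manually at the end of this
-  // function; note that an early return would leak them.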
- // init Tensors
- std::vector<nnfw::rt::Shape>::const_iterator it_inputShape = inputShapes.begin();
- for (auto inputData : inputDataPtrs)
- {
- const nnfw::rt::Shape& inputShape = *it_inputShape;
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::Tensor* inputPtr = new arm_compute::Tensor();
-
- inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- inputPtrs.push_back(inputPtr);
- inputIptrs.push_back(inputPtr);
-
- it_inputShape++;
- }
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
- output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- // prepare ACL Concatenate and configure tensors
- auto concat = std::make_shared<arm_compute::NEDepthConcatenateLayer>();
- concat->configure(inputIptrs, &output);
-
- // allocate Tensors
- it_inputShape = inputShapes.begin();
- std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin();
- for (auto inputPtr : inputPtrs)
- {
- inputPtr->allocator()->allocate();
-
- const float* inputData = *it_inputData;
- const nnfw::rt::Shape& inputShape = *it_inputShape;
-
- TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape);
-
- it_inputShape++;
- it_inputData++;
- }
- output.allocator()->allocate();
-
- // run
- concat->run();
-
- // get output
- TensorAccess<OutputAccessor>(output, outputData, outputShape);
-
- // cleanup
- for (auto inputPtr : inputPtrs)
- {
- inputPtr->allocator()->free();
- delete inputPtr;
- }
- output.allocator()->free();
-
- return true;
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Concatenation.test.cpp b/libs/kernel/acl/src/neon/Concatenation.test.cpp
deleted file mode 100644
index 03b05bd24..000000000
--- a/libs/kernel/acl/src/neon/Concatenation.test.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/Concatenation.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, neon_concatFloat32_1)
-{
- float inputData_1[6] = {
- 1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
- };
- float inputData_2[6] = {
- 7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
- };
- const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
- const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
- std::vector<const float*> inputDataPtrs;
- std::vector<nnfw::rt::Shape> inputShapes;
- float outputData[12];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
- bool bret;
-
- inputDataPtrs.push_back(inputData_1);
- inputDataPtrs.push_back(inputData_2);
- inputShapes.push_back(inputShape_1);
- inputShapes.push_back(inputShape_2);
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::concatenationFloat32(inputDataPtrs, inputShapes, 3,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
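-  // The expected result interleaves the two single-channel inputs along
-  // depth; it is written in NCHW and converted to NHWC below for comparison.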
- float expectNCHW[] = {
- 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 11, 12
- };
- float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
- util::NCHW2NHWC(expectNCHW, expectData, outputShape);
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/neon/Conv2D.cpp b/libs/kernel/acl/src/neon/Conv2D.cpp
deleted file mode 100644
index 679ecfced..000000000
--- a/libs/kernel/acl/src/neon/Conv2D.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <NeuralNetworks.h>
-
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-#include <util/environment.h>
-
-#include "../IO_accessor.h"
-#include "../util.h"
-#include "../shape.h"
-#include "../NEUniqueTensor.h"
-#include "../support.h"
-
-#include "util/feature/TextFormatter.h"
-
-#include "support/nnapi/feature/Reader.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace neon {
-
-static int verbose = 0;
-
-bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* filterData, const nnfw::rt::Shape& filterShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape);
- arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
- arm_compute::PadStrideInfo conv_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
- NEUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
- NEUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32));
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- auto conv_f = std::make_shared<arm_compute::NEConvolutionLayer>();
-
- conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info);
-
- fns.emplace_back(conv_f);
-
- util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns);
-
- input.allocate();
- output.allocate();
- bias.allocate();
- filter.allocate();
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
- TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
- TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape);
-
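-  // When CONV2D_VERBOSE is set in the environment, dump the input feature
-  // map as seen by both NNAPI and ACL to help diagnose layout mismatches.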
- nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose);
- if (verbose)
- {
- auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape);
- nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData};
- nnfw::support::acl::feature::Reader<float> acl_ifm_reader{ input.ptr() };
-
- std::cout << "NNAPI IFM:" << std::endl;
- std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl;
-
- std::cout << "ARM Compute IFM:" << std::endl;
- std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl;
- }
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Conv2D.test.cpp b/libs/kernel/acl/src/neon/Conv2D.test.cpp
deleted file mode 100644
index 6a3de1c43..000000000
--- a/libs/kernel/acl/src/neon/Conv2D.test.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <kernel/acl/Conv2D.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, neon_convFloat32_3x3to1x1)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 10.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_convFloat32_3x3to3x3)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[9];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 5.0f, 7.0f, 5.0f,
- 7.0f, 10.0f, 7.0f,
- 5.0f, 7.0f, 5.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_convFloat32_3x3to3x3_RELU)
-{
- float inputData[9];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float filterData[9];
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- float biasData[1] = { -5.0f };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[9];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] =
- {
- 0.0f, 1.0f, 0.0f,
- 1.0f, 4.0f, 1.0f,
- 0.0f, 1.0f, 0.0f
- };
-
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_convFloat32_3x5to3x3)
-{
- float inputData[15] = {
- 1,2,3,4,5,
- 6,7,8,9,10,
- 11,12,13,14,15
- };
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 };
- float filterData[18] = {
- 1,1,1, 1,1,1, 1,1,1,
- 2,2,2, 2,2,2, 2,2,2
- };
- const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 };
- float biasData[2] = { 1.0, 1.0 };
- const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 };
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
- float outputData[30];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 };
- bool bret;
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::convFloat32(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
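-  // Two output channels: the all-ones 3x3 filter sums each window and the
-  // all-twos filter doubles that sum; both include the bias of 1.0.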
- float expectNCHW[] = {
- 17.0f, 28.0f, 34.0f, 40.0f, 29.0f,
- 40.0f, 64.0f, 73.0f, 82.0f, 58.0f,
- 37.0f, 58.0f, 64.0f, 70.0f, 49.0f,
-
- 33.0f, 55.0f, 67.0f, 79.0f, 57.0f,
- 79.0f, 127.0f, 145.0f, 163.0f, 115.0f,
- 73.0f, 115.0f, 127.0f, 139.0f, 97.0f
- };
- float expectData[30];
- util::NCHW2NHWC(expectNCHW, expectData, outputShape);
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp b/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp
deleted file mode 100644
index bcf56c667..000000000
--- a/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/runtime/NEON/NEScheduler.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../NEUniqueTensor.h"
-#include "../DepthwiseConv2D.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace neon {
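-// NEON kernels execute synchronously, so the sync hook passed to the shared
-// implementation is a no-op here.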
-static void sync_scheduler() {
-}
-
-bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* filterData, const nnfw::rt::Shape& filterShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t depth_multiplier, int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape) {
- return common::depthwiseConvFloat32<NEUniqueTensor, arm_compute::NEDepthwiseConvolutionLayer,
- arm_compute::NEActivationLayer>(inputData, inputShape,
- filterData, filterShape,
- biasData, biasShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- depth_multiplier, activation,
- outputData, outputShape,
- sync_scheduler);
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/FullyConnected.cpp b/libs/kernel/acl/src/neon/FullyConnected.cpp
deleted file mode 100644
index 86229cbf2..000000000
--- a/libs/kernel/acl/src/neon/FullyConnected.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/runtime/NEON/NEScheduler.h>
-
-#include <cassert>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../NEUniqueTensor.h"
-#include "../FullyConnected.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace neon {
-
-static void sync_scheduler() {
-}
-
-bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float* weightsData, const nnfw::rt::Shape& weightsShape,
- const float* biasData, const nnfw::rt::Shape& biasShape,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape) {
-
- return common::fullyConnectedFloat32<NEUniqueTensor, arm_compute::NEFullyConnectedLayer,
- arm_compute::NEActivationLayer>(inputData, inputShape,
- weightsData, weightsShape,
- biasData, biasShape,
- activation,
- outputData, outputShape,
- sync_scheduler);
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
diff --git a/libs/kernel/acl/src/neon/FullyConnected.test.cpp b/libs/kernel/acl/src/neon/FullyConnected.test.cpp
deleted file mode 100644
index d4c95e4cb..000000000
--- a/libs/kernel/acl/src/neon/FullyConnected.test.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ACL_CORE_FUNC_NAME neon::fullyConnectedFloat32
-#define ACL_TEST(tc, t) TEST(tc, neon_##t)
-
-#include "../FullyConnected.test.h"
-
diff --git a/libs/kernel/acl/src/neon/Pooling.cpp b/libs/kernel/acl/src/neon/Pooling.cpp
deleted file mode 100644
index 5c58ae0b5..000000000
--- a/libs/kernel/acl/src/neon/Pooling.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../NEUniqueTensor.h"
-
-#include <cassert>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace neon {
-
-bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t filter_width, int32_t filter_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- arm_compute::PoolingLayerInfo maxpool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX,
- arm_compute::Size2D(filter_width,filter_height),
- pad_info, false);
-
- NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto pool_f = std::make_shared<arm_compute::NEPoolingLayer>();
- pool_f->configure(input.ptr(), output.ptr(), maxpool_info);
-
- fns.emplace_back(pool_f);
-
- util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns);
-
- input.allocate();
- output.allocate();
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-bool averagePoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- int32_t padding_left, int32_t padding_right,
- int32_t padding_top, int32_t padding_bottom,
- int32_t stride_width, int32_t stride_height,
- int32_t filter_width, int32_t filter_height,
- int32_t activation,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
-
- arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
- padding_left, padding_right,
- padding_top, padding_bottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
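-  // The trailing 'true' sets exclude_padding, so border averages divide by
-  // the number of valid (non-padded) input cells only.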
- arm_compute::PoolingLayerInfo pool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG,
- arm_compute::Size2D(filter_width,filter_height),
- pad_info, true);
-
- NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto pool_f = std::make_shared<arm_compute::NEPoolingLayer>();
- pool_f->configure(input.ptr(), output.ptr(), pool_info);
-
- fns.emplace_back(pool_f);
-
- util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns);
-
- input.allocate();
- output.allocate();
-
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- for (const auto &fn : fns)
- {
- fn->run();
- }
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
-
- return true;
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Pooling.test.cpp b/libs/kernel/acl/src/neon/Pooling.test.cpp
deleted file mode 100644
index 4e6593921..000000000
--- a/libs/kernel/acl/src/neon/Pooling.test.cpp
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <arm_compute/core/Types.h>
-#include <kernel/acl/Pooling.h>
-
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 9.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1_RELU)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = -1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value--;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_RELU;
-
- bret = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to2x2)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 1;
- int32_t padding_top = 0;
- int32_t padding_bottom = 1;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 2;
- int32_t filter_height = 2;
-
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 5.0f, 6.0f,
- 8.0f, 9.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_maxPoolFloat32_147x147to73x73)
-{
- util::TensorWrapper input({1,147,147,64});
- util::TensorWrapper output({1,73,73,64});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = neon::maxPoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,73,73,64});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, neon_maxPoolFloat32_71x71to35x35)
-{
- util::TensorWrapper input({1,71,71,192});
- util::TensorWrapper output({1,35,35,192});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = neon::maxPoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,35,35,192});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 5.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1_RELU)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- float outputData[1];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
- bool bret;
-
- float value = 3.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value--;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_RELU;
-
- bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.0f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to2x2)
-{
- util::TensorWrapper input({1,3,3,1});
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 2;
- int32_t filter_height = 2;
-
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
- bool bret;
-
- float value = 1.0f;
- input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return value++;
- });
-
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {
- 3.0f, 4.0f,
- 6.0f, 7.0f
- };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_averagePoolFloat32_35x35to35x35)
-{
- std::vector<uint32_t> dims = {1,35,35,192};
- util::TensorWrapper input(dims);
- util::TensorWrapper output(dims);
-
- int32_t padding_left = 1;
- int32_t padding_right = 1;
- int32_t padding_top = 1;
- int32_t padding_bottom = 1;
- int32_t stride_width = 1;
- int32_t stride_height = 1;
- int32_t filter_width = 3;
- int32_t filter_height = 3;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = neon::averagePoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected(dims);
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
-
-TEST(KernelACL_TC, neon_averagePoolFloat32_8x8to1x1)
-{
- util::TensorWrapper input({1,8,8,2048});
- util::TensorWrapper output({1,1,1,2048});
-
- int32_t padding_left = 0;
- int32_t padding_right = 0;
- int32_t padding_top = 0;
- int32_t padding_bottom = 0;
- int32_t stride_width = 2;
- int32_t stride_height = 2;
- int32_t filter_width = 8;
- int32_t filter_height = 8;
-
- input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 0.f;
- });
-
- int32_t activation = ANEURALNETWORKS_FUSED_NONE;
-
- bool bret = neon::averagePoolFloat32(input.ptr<float>(), input.shape(),
- padding_left, padding_right,
- padding_top, padding_bottom,
- stride_width, stride_height,
- filter_width, filter_height,
- activation,
- output.ptr<float>(), output.shape());
- EXPECT_EQ(bret, true);
-
- util::TensorWrapper expected({1,1,1,2048});
- expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
- return 1.0f;
- });
-
- EXPECT_EQ(output, expected);
-}
diff --git a/libs/kernel/acl/src/neon/Reshape.cpp b/libs/kernel/acl/src/neon/Reshape.cpp
deleted file mode 100644
index cef84c7f3..000000000
--- a/libs/kernel/acl/src/neon/Reshape.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-
-// TODO: fix include path in CMakeFiles
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../NEUniqueTensor.h"
-#include "../Reshape.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-
-namespace neon {
-
-static void sync_scheduler() {
-  // NEON kernels run synchronously, so there is nothing to wait on here;
-  // syncing the CL scheduler belongs to the CL backend only.
-}
-
-bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
- void* outputData, const nnfw::rt::Shape& outputShape) {
- return common::reshapeGeneric<NEUniqueTensor, arm_compute::NEReshapeLayer>
- (inputData, inputShape, outputData, outputShape, sync_scheduler);
-}
-
-} // namespace neon
-
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
diff --git a/libs/kernel/acl/src/neon/Reshape.test.cpp b/libs/kernel/acl/src/neon/Reshape.test.cpp
deleted file mode 100644
index 9aca45e7e..000000000
--- a/libs/kernel/acl/src/neon/Reshape.test.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ACL_CORE_FUNC_NAME neon::reshapeGeneric
-#define ACL_TEST(tc, t) TEST(tc, neon_##t)
-
-#include "../Reshape.test.h"
diff --git a/libs/kernel/acl/src/neon/Softmax.cpp b/libs/kernel/acl/src/neon/Softmax.cpp
deleted file mode 100644
index 79d614418..000000000
--- a/libs/kernel/acl/src/neon/Softmax.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <OperationsUtils.h>
-#include <NeuralNetworks.h>
-
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include "../IO_accessor.h"
-#include "../shape.h"
-#include "../util.h"
-#include "../NEUniqueTensor.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace neon {
-
-bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
- const float beta,
- float* outputData, const nnfw::rt::Shape& outputShape)
-{
- arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
- arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
-
- NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
- NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
-
- auto softmax_f = std::make_shared<arm_compute::NESoftmaxLayer>();
- softmax_f->configure(input.ptr(), output.ptr(), beta);
-
- input.allocate();
- output.allocate();
-
- if (inputShape.dimensions.size() == 4)
- {
- TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
-
- softmax_f->run();
-
- TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
- }
- else if (inputShape.dimensions.size() == 2)
- {
- // Softmax comes with 1xN matrix and this is translated to N vector in arm_compute::TensorShape
- TensorAccess<VectorInputAccessor>(input.ref(), inputData, inputShape);
-
- softmax_f->run();
-
- TensorAccess<VectorOutputAccessor>(output.ref(), outputData, outputShape);
- }
- else
- {
- assert("undefined dimension of input" && 0);
- return false;
- }
-
- return true;
-}
-
-} // namespace neon
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Softmax.test.cpp b/libs/kernel/acl/src/neon/Softmax.test.cpp
deleted file mode 100644
index 988f55078..000000000
--- a/libs/kernel/acl/src/neon/Softmax.test.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <OperationsUtils.h>
-#include <kernel/acl/nnfw_kernel_acl.h>
-#include <arm_compute/core/Types.h>
-#include <kernel/acl/Softmax.h>
-
-#include "../util.h"
-
-using namespace nnfw::kernel::acl;
-
-TEST(KernelACL_TC, neon_softmaxFloat32_1xn)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_softmaxFloat32_4d)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_softmaxFloat32_1xn_seq)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
-
-TEST(KernelACL_TC, neon_softmaxFloat32_4d_seq)
-{
- float inputData[4];
- const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- float outputData[4];
- const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
- const float beta = 1.0f;
- bool bret;
-
- util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
- util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
-
- bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
- EXPECT_EQ(bret, true);
-
- float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
- bret = util::compareData(outputData, expectData, outputShape);
- EXPECT_EQ(bret, true);
-}
diff --git a/libs/kernel/acl/src/shape.cpp b/libs/kernel/acl/src/shape.cpp
deleted file mode 100644
index 3c976ae94..000000000
--- a/libs/kernel/acl/src/shape.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cassert>
-
-#include "shape.h"
-
-namespace nnfw {
-namespace rt {
-
-// TODO remove from this source and use it from runtime
-uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx) {
- if (dimensionIdx >= shape.dimensions.size()) {
- // TODO, log the error
- return 0;
- }
- return shape.dimensions[dimensionIdx];
-}
-
-} // namespace rt
-} // namespace nnfw
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace util {
-
-arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape)
-{
- assert(shape.dimensions.size() == 1);
-
- const uint32_t len = nnfw::rt::getSizeOfDimension(shape, 0);
-
- return arm_compute::TensorShape(len);
-}
-
-arm_compute::TensorShape fromMatrixNNShape(const nnfw::rt::Shape& shape)
-{
- assert(shape.dimensions.size() == 2);
-
- const uint32_t n = nnfw::rt::getSizeOfDimension(shape, 0);
- const uint32_t c = nnfw::rt::getSizeOfDimension(shape, 1);
-
- return arm_compute::TensorShape(c, n);
-}
-
-arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape)
-{
- if( shape.dimensions.size() == 1 )
- return fromVectorNNShape(shape);
- else if( shape.dimensions.size() == 2 )
- return fromMatrixNNShape(shape);
-
- // TODO: need to treat 3D tensors.
-
- assert(shape.dimensions.size() == 4);
-
- // NNAPI assumes the following ordering:
- //
- // dim(0) -> N
- // dim(1) -> H
- // dim(2) -> W
- // dim(3) -> C
- //
- uint32_t c = nnfw::rt::getSizeOfDimension(shape, 3);
- uint32_t h = nnfw::rt::getSizeOfDimension(shape, 1);
- uint32_t w = nnfw::rt::getSizeOfDimension(shape, 2);
- uint32_t n = nnfw::rt::getSizeOfDimension(shape, 0);
-
- return arm_compute::TensorShape(w, h, c, n);
-}
-
-} // namespace util
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
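
For reference, the removed fromNNShape maps NNAPI's NHWC dimension order onto
arm_compute's innermost-first constructor order (W, H, C, N). A minimal sketch
of the resulting layout, assuming only the arm_compute headers; the concrete
dimension values are illustrative:

    #include <arm_compute/core/TensorShape.h>
    #include <cassert>

    int main()
    {
      // NNAPI rank-4 shape {N, H, W, C} = {1, 224, 224, 3} becomes
      // arm_compute::TensorShape(W, H, C, N).
      const arm_compute::TensorShape acl_shape(224 /* W */, 224 /* H */, 3 /* C */, 1 /* N */);

      // arm_compute indexes dimensions innermost-first, so index 0 is W.
      assert(acl_shape[0] == 224); // W
      assert(acl_shape[1] == 224); // H
      assert(acl_shape[2] == 3);   // C
      assert(acl_shape[3] == 1);   // N
      return 0;
    }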
diff --git a/libs/kernel/acl/src/shape.h b/libs/kernel/acl/src/shape.h
deleted file mode 100644
index 902115ebd..000000000
--- a/libs/kernel/acl/src/shape.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_SHAPE_H__
-#define __NNFW_KERNEL_ACL_SHAPE_H__
-
-#include <OperationsUtils.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/runtime/IFunction.h>
-#include <cassert>
-
-namespace nnfw {
-namespace rt {
-
-// TODO remove from this source and use it from runtime
-uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx);
-
-} // namespace rt
-} // namespace nnfw
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace util {
-
-arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape);
-arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape);
-
-template<class TensorT, class ActT>
-void insertFusedActivationLayer(TensorT& out, int activation,
- std::vector<std::shared_ptr<arm_compute::IFunction>>& fns) {
- auto relu_f = std::make_shared<ActT>();
-
- switch(activation) {
- case ANEURALNETWORKS_FUSED_NONE:
- // DO NOTHING
- return;
-
- case ANEURALNETWORKS_FUSED_RELU:
- {
- const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
-
- // Do in-place update
- relu_f->configure(out.ptr(), nullptr, relu_info);
- }
- break;
-
- case ANEURALNETWORKS_FUSED_RELU1:
- {
- const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 1.f);
-
- // Do in-place update
- relu_f->configure(out.ptr(), nullptr, relu_info);
- }
- break;
-
- case ANEURALNETWORKS_FUSED_RELU6:
- {
- const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
-
- // Do in-place update
- relu_f->configure(out.ptr(), nullptr, relu_info);
- }
- break;
-
- default:
- assert("Undefined activation type." && 0);
- break;
- }
-
- fns.emplace_back(relu_f);
-}
-
-} // namespace util
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_SHAPE_H__
diff --git a/libs/kernel/acl/src/support.cpp b/libs/kernel/acl/src/support.cpp
deleted file mode 100644
index d04aef59e..000000000
--- a/libs/kernel/acl/src/support.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "support.h"
-
-namespace nnfw
-{
-namespace support
-{
-namespace nnapi
-{
-namespace feature
-{
-
-// TODO Extract this function as a utility function
-// NOTE It is not a good design to access nnfw::rt::Shape from the nnfw_support_nnapi lib
-nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape)
-{
- // NNAPI assumes the following ordering:
- //
- // dim(0) -> N
- // dim(1) -> H
- // dim(2) -> W
- // dim(3) -> C
- //
- int32_t c = nnfw::rt::getSizeOfDimension(shape, 3);
- int32_t h = nnfw::rt::getSizeOfDimension(shape, 1);
- int32_t w = nnfw::rt::getSizeOfDimension(shape, 2);
-
- assert(nnfw::rt::getSizeOfDimension(shape, 0) == 1);
-
- return nnfw::util::feature::Shape{c, h, w};
-}
-
-} // namespace feature
-} // namespace nnapi
-} // namespace support
-} // namespace nnfw
diff --git a/libs/kernel/acl/src/support.h b/libs/kernel/acl/src/support.h
deleted file mode 100644
index 751d2c6cb..000000000
--- a/libs/kernel/acl/src/support.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
-#define __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
-
-// NOTE these are not finalized yet but need to be moved out of Conv2D
-// to separate the NEON implementation into its own folder
-// TODO move them to the folder where they belong
-
-#include <cassert>
-
-#include "util/feature/Shape.h"
-
-#include <OperationsUtils.h>
-
-namespace nnfw
-{
-namespace support
-{
-namespace nnapi
-{
-namespace feature
-{
-
-// TODO Extract this function as a utility function
-// NOTE It is not a good design to access nnfw::rt::Shape from the nnfw_support_nnapi lib
-nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape);
-
-} // namespace feature
-} // namespace nnapi
-} // namespace support
-} // namespace nnfw
-
-#include <arm_compute/core/ITensor.h>
-
-#include "util/feature/Reader.h"
-
-namespace nnfw
-{
-namespace support
-{
-namespace acl
-{
-namespace feature
-{
-
-template<typename T> class Reader;
-
-template<> class Reader<float> final : public nnfw::util::feature::Reader<float>
-{
-public:
- Reader(arm_compute::ITensor *tensor) : _tensor{tensor}
- {
- assert(tensor->info()->data_type() == arm_compute::DataType::F32);
- }
-
-public:
- float at(uint32_t ch, uint32_t row, uint32_t col) const override
- {
- return *ptr_to_element(ch, row, col);
- }
-
-private:
- float *ptr_to_element(uint32_t ch, uint32_t row, uint32_t col) const
- {
- // ARM Compute uses CHW ordering
- return reinterpret_cast<float *>(_tensor->ptr_to_element(arm_compute::Coordinates{col, row, ch}));
- }
-
-private:
- arm_compute::ITensor *_tensor;
-};
-
-} // namespace feature
-} // namespace acl
-} // namespace support
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
diff --git a/libs/kernel/acl/src/util.cpp b/libs/kernel/acl/src/util.cpp
deleted file mode 100644
index 7e5df534e..000000000
--- a/libs/kernel/acl/src/util.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <util/fp32.h>
-
-#include "util.h"
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace util {
-
-void initData(float* data, int num, float value)
-{
- for (int i = 0; i < num; i++) {
- *(data + i) = value;
- }
-}
-
-void initData_Increasing(float* data, int num, float value)
-{
- for (int i = 0; i < num; i++) {
- *(data + i) = value;
- value++;
- }
-}
-
-// compareData
-// return true if result == expected with the shape info,
-// otherwise false
-bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape)
-{
- if (shape.dimensions.size() == 4)
- {
- // TODO fix indentation
- uint32_t height = nnfw::rt::getSizeOfDimension(shape, 1);
- uint32_t width = nnfw::rt::getSizeOfDimension(shape, 2);
- uint32_t numitems = height * width;
- for (int item = 0; item < numitems; item++) {
- if (!::nnfw::util::fp32::epsilon_equal(*(result + item), *(expected + item), 1)) {
- LOG(ERROR) << "compareData failed: result " << *(result + item)
- << ", expected " << *(expected + item)
- << ", diff " << ::nnfw::util::fp32::relative_diff(*(result + item), *(expected + item))
- << std::endl;
- return false;
- }
- }
- }
- else if (shape.dimensions.size() == 2)
- {
- uint32_t height = nnfw::rt::getSizeOfDimension(shape, 0);
- uint32_t width = nnfw::rt::getSizeOfDimension(shape, 1);
- uint32_t numitems = height * width;
- for (int item = 0; item < numitems; item++) {
- if (!::nnfw::util::fp32::epsilon_equal(*(result + item), *(expected + item), 1)) {
- LOG(ERROR) << "compareData failed: result " << *(result + item)
- << ", expected " << *(expected + item)
- << ", diff " << ::nnfw::util::fp32::relative_diff(*(result + item), *(expected + item))
- << std::endl;
- return false;
- }
- }
- }
- else
- {
- // TODO: add a handler for rank 1 and 3
- LOG(ERROR) << "Unhandled shape: " << shape.dimensions.size() << std::endl;
- }
- return true;
-}
-
-void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape)
-{
- uint32_t N = nnfw::rt::getSizeOfDimension(shape, 0);
- uint32_t H = nnfw::rt::getSizeOfDimension(shape, 1);
- uint32_t W = nnfw::rt::getSizeOfDimension(shape, 2);
- uint32_t C = nnfw::rt::getSizeOfDimension(shape, 3);
-
- for (uint32_t n = 0; n < N; n++) {
- for (uint32_t c = 0; c < C; c++) {
- for (uint32_t h = 0; h < H; h++) {
- for (uint32_t w = 0; w < W; w++) {
- uint32_t soffset = w + (h * W) + (c * W * H) + (n * W * H * C);
- uint32_t doffset = c + (w * C) + (h * C * W) + (n * C * W * H);
- *(nhwc + doffset) = *(nchw + soffset);
- }
- }
- }
- }
-}
-
-} // namespace util
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
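
NCHW2NHWC above is pure offset arithmetic: soffset = w + h*W + c*W*H + n*W*H*C
reads NCHW and doffset = c + w*C + h*C*W + n*C*W*H writes NHWC. Worked through
for N=1, C=2, H=2, W=2, the element (n=0, c=1, h=1, w=0) sits at soffset
0 + 2 + 4 + 0 = 6 and lands at doffset 1 + 0 + 4 + 0 = 5. A self-contained
check of that identity:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint32_t N = 1, C = 2, H = 2, W = 2;
      const float nchw[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      float nhwc[8] = {};

      for (uint32_t n = 0; n < N; n++)
        for (uint32_t c = 0; c < C; c++)
          for (uint32_t h = 0; h < H; h++)
            for (uint32_t w = 0; w < W; w++)
              nhwc[c + w * C + h * C * W + n * C * W * H] =
                  nchw[w + h * W + c * W * H + n * W * H * C];

      // (n=0, c=1, h=1, w=0): NCHW offset 6 lands at NHWC offset 5.
      assert(nhwc[5] == nchw[6]);
      return 0;
    }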
diff --git a/libs/kernel/acl/src/util.h b/libs/kernel/acl/src/util.h
deleted file mode 100644
index 48ed02783..000000000
--- a/libs/kernel/acl/src/util.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_KERNEL_ACL_UTIL_H__
-#define __NNFW_KERNEL_ACL_UTIL_H__
-#include <OperationsUtils.h>
-
-#include <cmath>
-#include <cassert>
-#include <functional>
-
-namespace nnfw {
-namespace kernel {
-namespace acl {
-namespace util {
-
-// TODO: make a separate module.
-class TensorWrapper {
-public:
- TensorWrapper(std::vector<uint32_t> dims,
- OperandType type = OperandType::FLOAT32,
- float scale = 1.0,
- int32_t offset = 0)
- :_shape{type, dims, scale, offset}
- {
-
- // currently, we support only FLOAT32 for now.
- assert( type == OperandType::FLOAT32);
-
- uint32_t size_bytes = sizeof(float);
-
- _num_elems = 1;
- for( auto& d: dims ) {
- _num_elems *= d;
- }
-
- _data = new uint8_t[_num_elems * size_bytes];
- }
-
- ~TensorWrapper() {
- delete [] _data;
- }
-
- const nnfw::rt::Shape shape() const {
- return _shape;
- }
-
- uint32_t num_elems() const { return _num_elems; }
-
- template<class T>
- T at(const uint32_t& idx) const {
- return reinterpret_cast<T*>(_data)[idx];
- }
-
- template<class T>
- T& at(const uint32_t& idx) {
- return reinterpret_cast<T*>(_data)[idx];
- }
-
- template<class T>
- T* ptr() { return reinterpret_cast<T*>(_data); }
-
- void initValue(float f) {
- for( uint32_t i = 0; i < _num_elems; ++i ) {
- at<float>(i) = f;
- }
- }
-
- typedef std::function<float(uint32_t n, uint32_t c, uint32_t h, uint32_t w)> funcInit4;
- void initValue(funcInit4 f) {
- assert(_shape.dimensions.size() == 4);
-
- int N = _shape.dimensions[0];
- int H = _shape.dimensions[1];
- int W = _shape.dimensions[2];
- int C = _shape.dimensions[3];
-
- for(int n = 0; n < N; ++n) {
- for(int h = 0; h < H; ++h) {
- for(int w = 0; w < W; ++w) {
- for(int c = 0; c < C; ++c) {
- uint32_t offset = n*H*W*C + h*W*C + w*C + c;
- at<float>(offset) = f(n,c,h,w);
- }
- }
- }
- }
- }
-
- typedef std::function<float(uint32_t c, uint32_t h, uint32_t w)> funcInit3;
- void initValue(funcInit3 f) {
- assert(_shape.dimensions.size() == 3);
-
- int C = _shape.dimensions[0];
- int H = _shape.dimensions[1];
- int W = _shape.dimensions[2];
-
- for(int h = 0; h < H; ++h) {
- for(int w = 0; w < W; ++w) {
- for(int c = 0; c < C; ++c) {
- uint32_t offset = h*W*C + w*C + c;
- at<float>(offset) = f(c,h,w);
- }
- }
- }
- }
-
- typedef std::function<float(uint32_t h, uint32_t w)> funcInit2;
- void initValue(funcInit2 f) {
- assert(_shape.dimensions.size() == 2);
-
- int H = _shape.dimensions[0];
- int W = _shape.dimensions[1];
-
- for(int h = 0; h < H; ++h) {
- for(int w = 0; w < W; ++w) {
- uint32_t offset = h*W + w;
- at<float>(offset) = f(h,w);
- }
- }
- }
-
- typedef std::function<float(uint32_t w)> funcInit1;
- void initValue(funcInit1 f) {
- assert(_shape.dimensions.size() == 1);
-
- int W = _shape.dimensions[0];
-
- for(int w = 0; w < W; ++w) {
- uint32_t offset = w;
- at<float>(offset) = f(w);
- }
- }
-
- void initValue(std::vector<float> v) {
- assert(v.size() == _num_elems);
- for( uint32_t i = 0; i < _num_elems; ++i ) {
- at<float>(i) = v[i];
- }
- }
-
- bool operator==(const TensorWrapper &t) const {
- // compare the shape
- assert(num_elems() == t.num_elems());
- assert(_shape.type == t.shape().type);
- assert(_shape.scale == t.shape().scale);
- assert(_shape.offset == t.shape().offset);
- assert(_shape.dimensions == t.shape().dimensions);
-
- // currently, we support only FLOAT32.
- assert(_shape.type == OperandType::FLOAT32);
-
- for( uint32_t i = 0; i < _num_elems; ++i ) {
- if( std::fabs(static_cast<float>(at<float>(i) - t.at<float>(i))) > 0.001f ) {
- std::cout << "Comparing [" << i << "] " << at<float>(i) << "," << t.at<float>(i) << std::endl;
- return false;
- }
- }
-
- return true;
- }
-
-private:
- nnfw::rt::Shape _shape;
- uint32_t _num_elems;
- uint8_t* _data;
-};
-
-void initData(float* data, int num, float value);
-bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape);
-void initData_Increasing(float* data, int num, float value);
-
-void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape);
-
-} // namespace util
-} // namespace acl
-} // namespace kernel
-} // namespace nnfw
-
-#endif // __NNFW_KERNEL_ACL_UTIL_H__
diff --git a/libs/support/nnapi/CMakeLists.txt b/libs/support/nnapi/CMakeLists.txt
index cd1f365cf..193bcbd4e 100644
--- a/libs/support/nnapi/CMakeLists.txt
+++ b/libs/support/nnapi/CMakeLists.txt
@@ -3,4 +3,4 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
add_library(nnfw_support_nnapi ${SOURCES})
set_property(TARGET nnfw_support_nnapi PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(nnfw_support_nnapi PUBLIC ${CMAKE_SOURCE_DIR}/include)
-target_link_libraries(nnfw_support_nnapi nnfw_util)
+target_link_libraries(nnfw_support_nnapi static_nnfw_util)
diff --git a/libs/support/nnapi/src/Utils.cpp b/libs/support/nnapi/src/Utils.cpp
new file mode 100644
index 000000000..ae1076fd1
--- /dev/null
+++ b/libs/support/nnapi/src/Utils.cpp
@@ -0,0 +1,29 @@
+#include "support/nnapi/Utils.h"
+
+#include <cassert>
+
+namespace nnfw
+{
+namespace support
+{
+namespace nnapi
+{
+
+const char *to_string(const PaddingCode &code)
+{
+ assert((ANEURALNETWORKS_PADDING_SAME == code) || (ANEURALNETWORKS_PADDING_VALID == code));
+
+ switch (code)
+ {
+ case ANEURALNETWORKS_PADDING_SAME:
+ return "ANEURALNETWORKS_PADDING_SAME";
+ case ANEURALNETWORKS_PADDING_VALID:
+ return "ANEURALNETWORKS_PADDING_VALID";
+ }
+
+ return nullptr;
+}
+
+} // namespace nnapi
+} // namespace support
+} // namespace nnfw
diff --git a/libs/support/tflite/CMakeLists.txt b/libs/support/tflite/CMakeLists.txt
index cccc7de3d..667b3bc11 100644
--- a/libs/support/tflite/CMakeLists.txt
+++ b/libs/support/tflite/CMakeLists.txt
@@ -2,9 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
file(GLOB_RECURSE TESTS "src/*.test.cpp")
list(REMOVE_ITEM SOURCES ${TESTS})
-add_library(nnfw_support_tflite ${SOURCES})
+add_library(nnfw_support_tflite STATIC ${SOURCES})
+set_target_properties(nnfw_support_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(nnfw_support_tflite PUBLIC ${CMAKE_SOURCE_DIR}/include)
-target_link_libraries(nnfw_support_tflite nnfw_util tensorflow-lite ${LIB_PTHREAD} dl)
+target_link_libraries(nnfw_support_tflite tensorflow-lite ${LIB_PTHREAD} dl)
+target_link_libraries(nnfw_support_tflite static_nnfw_util)
add_executable(nnfw_support_tflite_test_TensorView src/TensorView.test.cpp)
target_link_libraries(nnfw_support_tflite_test_TensorView nnfw_support_tflite)
diff --git a/libs/support/tflite/src/Diff.cpp b/libs/support/tflite/src/Diff.cpp
index f382df2d6..e875571cb 100644
--- a/libs/support/tflite/src/Diff.cpp
+++ b/libs/support/tflite/src/Diff.cpp
@@ -15,24 +15,31 @@
*/
#include "support/tflite/Diff.h"
+#include "support/tflite/nnapi_delegate.h"
#include "util/fp32.h"
#include "util/tensor/IndexIterator.h"
#include "util/tensor/IndexFormatter.h"
#include "util/tensor/Zipper.h"
+#include "util/tensor/Comparator.h"
+
+#include "util/environment.h"
#include <iostream>
+#include <cassert>
-class DiffSummary : public TfLiteTensorComparator::Observer
+class DiffSummary : public nnfw::util::tensor::Comparator::Observer
{
public:
DiffSummary()
- : max_abs_diff_index(0), max_abs_diff_value{0.0f},
- max_rel_diff_index(0), max_rel_diff_value{0.0f}
+ : max_abs_diff_index(0), max_abs_diff_expected{0.0f}, max_abs_diff_obtained{0.0f},
+ max_abs_diff_value{0.0f}, max_rel_diff_index(0), max_rel_diff_expected{0.0f},
+ max_rel_diff_obtained{0.0f}, max_rel_diff_value{0.0f}
{
// DO NOTHING
}
+
public:
void notify(const nnfw::util::tensor::Index &index, float expected, float obtained) override;
@@ -71,170 +78,422 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected,
}
}
-std::vector<TfLiteTensorDiff>
-TfLiteTensorComparator::compare(const nnfw::support::tflite::TensorView<float> &expected,
- const nnfw::support::tflite::TensorView<float> &obtained,
- Observer *observer) const
+template <typename T>
+bool TfLiteInterpMatchApp::compareSingleTensorView(
+ const nnfw::support::tflite::TensorView<T> &expected,
+ const nnfw::support::tflite::TensorView<T> &obtained, int id) const
{
- std::vector<TfLiteTensorDiff> res;
-
+ std::vector<nnfw::util::tensor::Diff<T>> diffs;
assert(expected.shape() == obtained.shape());
- nnfw::util::tensor::zip(expected.shape(), expected, obtained) <<
- [&] (const nnfw::util::tensor::Index &index, float expected_value, float obtained_value)
+ using nnfw::util::tensor::zip;
+ using nnfw::util::tensor::Index;
+
+ zip(expected.shape(), expected, obtained)
+ << [&](const Index &index, T expected_value, T obtained_value) {
+ if (expected_value != obtained_value)
+ {
+ diffs.emplace_back(index, expected_value, obtained_value);
+ }
+ };
+
+ // TODO Unify summary generation code
+ if (diffs.size() == 0)
{
- const auto relative_diff = nnfw::util::fp32::relative_diff(expected_value, obtained_value);
+ std::cout << " Tensor #" << id << ": MATCHED" << std::endl;
+ }
+ else
+ {
+ std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl;
+ std::cout << " " << diffs.size() << " diffs are detected" << std::endl;
+ }
- if (!_compare_fn(expected_value, obtained_value))
+ if (diffs.size() > 0 && _verbose != 0)
+ {
+ std::cout << " ---- Details ---" << std::endl;
+ for (const auto &diff : diffs)
{
- TfLiteTensorDiff diff(index);
+ std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]"
+ << std::endl;
+ std::cout << " expected: " << diff.expected << std::endl;
+ std::cout << " obtained: " << diff.obtained << std::endl;
+ }
+ }
- diff.expected = expected_value;
- diff.obtained = obtained_value;
+ return diffs.size() == 0;
+}
- res.emplace_back(diff);
- }
+template <>
+bool TfLiteInterpMatchApp::compareSingleTensorView<float>(
+ const nnfw::support::tflite::TensorView<float> &expected,
+ const nnfw::support::tflite::TensorView<float> &obtained, int id) const
+{
+ DiffSummary summary;
- // Update max_diff_index, if necessary
- if (observer != nullptr)
+ assert(expected.shape() == obtained.shape());
+ auto diffs = _comparator.compare(expected.shape(), expected, obtained, &summary);
+
+ // TODO Unify summary generation code
+ if (diffs.size() == 0)
+ {
+ std::cout << " Tensor #" << id << ": MATCHED" << std::endl;
+ }
+ else
+ {
+ std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl;
+ std::cout << " " << diffs.size() << " diffs are detected" << std::endl;
+ }
+
+ // Print out max_diff
+ if (summary.max_abs_diff_value > 0)
+ {
+ std::cout << " Max absolute diff at ["
+ << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl;
+ std::cout << " expected: " << summary.max_abs_diff_expected << std::endl;
+ std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl;
+ std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl;
+ }
+
+ if (summary.max_rel_diff_value > 0)
+ {
+ const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON;
+
+ std::cout << " Max relative diff at ["
+ << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl;
+ std::cout << " expected: " << summary.max_rel_diff_expected << std::endl;
+ std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl;
+ std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl;
+ std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl;
+ }
+
+ if (diffs.size() > 0)
+ {
+ if (_verbose != 0)
{
- observer->notify(index, expected_value, obtained_value);
+ std::cout << " ---- Details ---" << std::endl;
+ for (const auto &diff : diffs)
+ {
+ const auto absolute_diff = std::fabs(diff.expected - diff.obtained);
+ const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained);
+ const auto tolerance_level = relative_diff / FLT_EPSILON;
+
+ std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]"
+ << std::endl;
+ std::cout << " expected: " << diff.expected << std::endl;
+ std::cout << " obtained: " << diff.obtained << std::endl;
+ std::cout << " absolute diff: " << absolute_diff << std::endl;
+ std::cout << " relative diff: " << relative_diff << std::endl;
+ std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl;
+ }
}
- };
- return res;
+ return false;
+ }
+ return true;
}
+#include <map>
+
bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpreter &nnapi) const
{
assert(interp.outputs() == nnapi.outputs());
- for (const auto &id : interp.outputs())
- {
+ bool all_matched = true;
+
+ using Comparator = std::function<bool(int id, ::tflite::Interpreter &, ::tflite::Interpreter &)>;
+
+ std::map<TfLiteType, Comparator> comparators;
+
+ comparators[kTfLiteUInt8] = [this](int id, ::tflite::Interpreter &interp,
+ ::tflite::Interpreter &nnapi) {
+ const auto expected = nnfw::support::tflite::TensorView<uint8_t>::make(interp, id);
+ const auto obtained = nnfw::support::tflite::TensorView<uint8_t>::make(nnapi, id);
+
+ return compareSingleTensorView(expected, obtained, id);
+ };
+
+ comparators[kTfLiteInt32] = [this](int id, ::tflite::Interpreter &interp,
+ ::tflite::Interpreter &nnapi) {
+ const auto expected = nnfw::support::tflite::TensorView<int32_t>::make(interp, id);
+ const auto obtained = nnfw::support::tflite::TensorView<int32_t>::make(nnapi, id);
+
+ return compareSingleTensorView(expected, obtained, id);
+ };
+
+ comparators[kTfLiteFloat32] = [this](int id, ::tflite::Interpreter &interp,
+ ::tflite::Interpreter &nnapi) {
const auto expected = nnfw::support::tflite::TensorView<float>::make(interp, id);
const auto obtained = nnfw::support::tflite::TensorView<float>::make(nnapi, id);
- DiffSummary summary;
+ return compareSingleTensorView(expected, obtained, id);
+ };
- auto diffs = _comparator.compare(expected, obtained, &summary);
+ for (const auto &id : interp.outputs())
+ {
+ assert(interp.tensor(id)->type == nnapi.tensor(id)->type);
- if (diffs.size() == 0)
- {
- std::cout << " Tensor #" << id << ": MATCHED" << std::endl;
- }
- else
- {
- std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl;
- std::cout << " " << diffs.size() << " diffs are detected" << std::endl;
- }
+ auto it = comparators.find(interp.tensor(id)->type);
- // Print out max_diff
- if (summary.max_abs_diff_value > 0)
+ if (it == comparators.end())
{
- std::cout << " Max absolute diff at [" << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl;
- std::cout << " expected: " << summary.max_abs_diff_expected << std::endl;
- std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl;
- std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl;
+ throw std::runtime_error{"Not supported output type"};
}
- if (summary.max_rel_diff_value > 0)
- {
- const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON;
+ const auto &comparator = it->second;
- std::cout << " Max relative diff at [" << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl;
- std::cout << " expected: " << summary.max_rel_diff_expected << std::endl;
- std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl;
- std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl;
- std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl;
- }
-
- if (diffs.size() > 0)
+ if (!comparator(id, interp, nnapi))
{
- if (_verbose != 0)
- {
- std::cout << " ---- Details ---" << std::endl;
- for (const auto &diff : diffs)
- {
- const auto absolute_diff = std::fabs(diff.expected - diff.obtained);
- const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained);
- const auto tolerance_level = relative_diff / FLT_EPSILON;
-
- std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]" << std::endl;
- std::cout << " expected: " << diff.expected << std::endl;
- std::cout << " obtained: " << diff.obtained << std::endl;
- std::cout << " absolute diff: " << absolute_diff << std::endl;
- std::cout << " relative diff: " << relative_diff << std::endl;
- std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl;
- }
- }
-
- return false;
+ all_matched = false;
}
}
- return true;
+ return all_matched;
}
#include "util/tensor/Object.h"
+using namespace std::placeholders;
+
+template <> uint8_t RandomGenerator::generate<uint8_t>(void)
+{
+ // The value of type_range is 255.
+ float type_range = static_cast<float>(std::numeric_limits<uint8_t>::max()) -
+ static_cast<float>(std::numeric_limits<uint8_t>::min());
+ // Most _dist values range from -5.0 to 5.0.
+ float min_range = -5.0f;
+ float max_range = 5.0f;
+ return static_cast<uint8_t>((_dist(_rand) - min_range) * type_range / (max_range - min_range));
+}
+
+#include "support/tflite/TensorLogger.h"
//
// Random Test Runner
//
int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
{
- auto pure = builder.build();
+ auto tfl_interp = builder.build();
auto nnapi = builder.build();
- pure->UseNNAPI(false);
- nnapi->UseNNAPI(true);
+ tfl_interp->UseNNAPI(false);
// Allocate Tensors
- pure->AllocateTensors();
+ tfl_interp->AllocateTensors();
nnapi->AllocateTensors();
- assert(pure->inputs() == nnapi->inputs());
+ assert(tfl_interp->inputs() == nnapi->inputs());
- // Fill IFM with random numbers
- auto ifm_gen = [this] (const nnfw::util::tensor::Shape &, const nnfw::util::tensor::Index &)
- {
- // TODO Allow users to set min/max and distribution
- std::normal_distribution<float> dist(0.0f, 2.0f);
- return dist(_rand);
+ using ::tflite::Interpreter;
+ using Initializer = std::function<void(int id, Interpreter *, Interpreter *)>;
+
+ std::map<TfLiteType, Initializer> initializers;
+ std::map<TfLiteType, Initializer> reseters;
+
+  // Generate signed 32-bit integer (s32) input
+ initializers[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteInt32);
+ assert(nnapi->tensor(id)->type == kTfLiteInt32);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ int32_t value = 0;
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ // TODO Generate random values
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ ++value;
+ };
};
- for (const auto id : pure->inputs())
- {
- auto pure_view = nnfw::support::tflite::TensorView<float>::make(*pure, id);
+  // Reset signed 32-bit integer (s32) output
+ reseters[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteInt32);
+ assert(nnapi->tensor(id)->type == kTfLiteInt32);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ int32_t value = 0;
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ // TODO Generate random values
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
+
+ initializers[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteUInt8);
+ assert(nnapi->tensor(id)->type == kTfLiteUInt8);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
+ const ::nnfw::util::tensor::Index &)>(
+ &RandomGenerator::generate<uint8_t>);
+ const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+ assert(tfl_interp_view.shape() == data.shape());
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ const auto value = data.at(ind);
+
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
+
+ reseters[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteUInt8);
+ assert(nnapi->tensor(id)->type == kTfLiteUInt8);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
+ const ::nnfw::util::tensor::Index &)>(
+ &RandomGenerator::generate<uint8_t>);
+ const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+ assert(tfl_interp_view.shape() == data.shape());
+
+ uint8_t value = 0;
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
+
+ initializers[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteFloat32);
+ assert(nnapi->tensor(id)->type == kTfLiteFloat32);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id);
auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id);
- assert(pure_view.shape() == nnapi_view.shape());
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
- const nnfw::util::tensor::Object<float> data(pure_view.shape(), ifm_gen);
+ auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
+ const ::nnfw::util::tensor::Index &)>(
+ &RandomGenerator::generate<float>);
+ const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+
+ assert(tfl_interp_view.shape() == data.shape());
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ const auto value = data.at(ind);
+
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
- assert(pure_view.shape() == data.shape());
+ reseters[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
+ assert(tfl_interp->tensor(id)->type == kTfLiteFloat32);
+ assert(nnapi->tensor(id)->type == kTfLiteFloat32);
+
+ auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id);
+ auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id);
+
+ assert(tfl_interp_view.shape() == nnapi_view.shape());
+
+ auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &,
+ const ::nnfw::util::tensor::Index &)>(
+ &RandomGenerator::generate<float>);
+ const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(),
+ std::bind(fp, _randgen, _1, _2));
+
+ assert(tfl_interp_view.shape() == data.shape());
+
+ float value = 0;
+
+ nnfw::util::tensor::iterate(tfl_interp_view.shape())
+ << [&](const nnfw::util::tensor::Index &ind) {
+ tfl_interp_view.at(ind) = value;
+ nnapi_view.at(ind) = value;
+ };
+ };
- nnfw::util::tensor::iterate(pure_view.shape()) << [&] (const nnfw::util::tensor::Index &ind)
+ // Fill IFM with random numbers
+ for (const auto id : tfl_interp->inputs())
+ {
+ assert(tfl_interp->tensor(id)->type == nnapi->tensor(id)->type);
+
+ auto it = initializers.find(tfl_interp->tensor(id)->type);
+
+ if (it == initializers.end())
{
- const auto value = data.at(ind);
+ throw std::runtime_error{"Not supported input type"};
+ }
- pure_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
+ it->second(id, tfl_interp.get(), nnapi.get());
+ }
+
+ // Fill OFM with 0
+ for (const auto id : tfl_interp->outputs())
+ {
+ assert(tfl_interp->tensor(id)->type == nnapi->tensor(id)->type);
+
+ auto it = reseters.find(tfl_interp->tensor(id)->type);
+
+ if (it == reseters.end())
+ {
+      throw std::runtime_error{"Not supported output type"};
+ }
+
+ it->second(id, tfl_interp.get(), nnapi.get());
}
std::cout << "[NNAPI TEST] Run T/F Lite Interpreter without NNAPI" << std::endl;
- pure->Invoke();
+ tfl_interp->Invoke();
std::cout << "[NNAPI TEST] Run T/F Lite Interpreter with NNAPI" << std::endl;
- nnapi->Invoke();
+
+ char *env = getenv("UPSTREAM_DELEGATE");
+
+ if (env && !std::string(env).compare("1"))
+ {
+ nnapi->UseNNAPI(true);
+ nnapi->Invoke();
+ }
+ else
+ {
+ nnfw::NNAPIDelegate d;
+
+ if (d.BuildGraph(nnapi.get()))
+ {
+ throw std::runtime_error{"Failed to BuildGraph"};
+ }
+
+ if (d.Invoke(nnapi.get()))
+ {
+      throw std::runtime_error{"Failed to Invoke"};
+ }
+ }
// Compare OFM
std::cout << "[NNAPI TEST] Compare the result" << std::endl;
const auto tolerance = _param.tolerance;
- auto equals = [tolerance] (float lhs, float rhs)
- {
+ auto equals = [tolerance](float lhs, float rhs) {
// NOTE Hybrid approach
// TODO Allow users to set tolerance for absolute_epsilon_equal
if (nnfw::util::fp32::absolute_epsilon_equal(lhs, rhs))
@@ -245,12 +504,12 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
return nnfw::util::fp32::epsilon_equal(lhs, rhs, tolerance);
};
- TfLiteTensorComparator comparator(equals);
+ nnfw::util::tensor::Comparator comparator(equals);
TfLiteInterpMatchApp app(comparator);
app.verbose() = _param.verbose;
- bool res = app.run(*pure, *nnapi);
+ bool res = app.run(*tfl_interp, *nnapi);
if (!res)
{
@@ -258,5 +517,22 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder)
}
std::cout << "[NNAPI TEST] PASSED" << std::endl;
+
+ if (_param.tensor_logging)
+ nnfw::support::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp);
+
return 0;
}
+
+RandomTestRunner RandomTestRunner::make(int seed)
+{
+ RandomTestParam param;
+
+ param.verbose = 0;
+ param.tolerance = 1;
+
+ nnfw::util::env::IntAccessor("VERBOSE").access(param.verbose);
+ nnfw::util::env::IntAccessor("TOLERANCE").access(param.tolerance);
+
+ return RandomTestRunner{seed, param};
+}
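
The reworked runner is driven by environment variables: RandomTestRunner::make
reads VERBOSE and TOLERANCE through IntAccessor, and run() takes the
nnfw::NNAPIDelegate path unless UPSTREAM_DELEGATE=1 selects the stock
UseNNAPI(true) route. A hedged driver sketch; the header paths follow the
pattern used above:

    #include "support/tflite/Diff.h"
    #include "support/tflite/interp/Builder.h"

    #include <cstdlib>

    int drive_random_test(const nnfw::support::tflite::interp::Builder &builder)
    {
      // Equivalent to exporting these before launching the test binary.
      setenv("VERBOSE", "1", 1);   // print per-element diff details
      setenv("TOLERANCE", "5", 1); // widen the relative-diff threshold

      auto runner = RandomTestRunner::make(/*seed=*/0);

      // run() returns 0 when every output tensor matches.
      return runner.run(builder);
    }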
diff --git a/libs/support/tflite/src/FeatureView.cpp b/libs/support/tflite/src/FeatureView.cpp
index 50f599d2e..4c7636780 100644
--- a/libs/support/tflite/src/FeatureView.cpp
+++ b/libs/support/tflite/src/FeatureView.cpp
@@ -28,11 +28,8 @@ namespace tflite
nnfw::util::feature::Shape getFeatureShape(const TfLiteTensor *tensor)
{
- nnfw::util::feature::Shape shape;
-
- shape.C = tensor->dims->data[3];
- shape.H = tensor->dims->data[1];
- shape.W = tensor->dims->data[2];
+ nnfw::util::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1],
+ tensor->dims->data[2]};
return shape;
}
diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp b/libs/support/tflite/src/Quantization.cpp
index d729d538e..b23204d41 100644
--- a/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp
+++ b/libs/support/tflite/src/Quantization.cpp
@@ -14,7 +14,9 @@
* limitations under the License.
*/
-#define ACL_CORE_FUNC_NAME neon::depthwiseConvFloat32
-#define ACL_TEST(tc, t) TEST(tc, neon_##t)
+#include "support/tflite/Quantization.h"
-#include "../DepthwiseConv2D.test.h"
+TfLiteQuantizationParams make_default_quantization(void)
+{
+ return TfLiteQuantizationParams{0.0f, 0};
+}
diff --git a/libs/support/tflite/src/TensorShapeUtils.cpp b/libs/support/tflite/src/TensorShapeUtils.cpp
new file mode 100644
index 000000000..611ba920e
--- /dev/null
+++ b/libs/support/tflite/src/TensorShapeUtils.cpp
@@ -0,0 +1,51 @@
+#include "support/tflite/TensorShapeUtils.h"
+
+namespace nnfw
+{
+namespace support
+{
+namespace tflite
+{
+
+nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape,
+ const nnfw::util::tensor::Shape &rhs_shape)
+{
+ const uint32_t lhs_rank = lhs_shape.rank();
+ const uint32_t rhs_rank = rhs_shape.rank();
+ const uint32_t out_rank = std::max(lhs_rank, rhs_rank);
+
+ // TODO Simplify implementation
+ std::vector<int32_t> lhs_normalized_dims;
+ std::vector<int32_t> rhs_normalized_dims;
+
+ for (uint32_t n = 0; n < out_rank - lhs_rank; ++n)
+ {
+ lhs_normalized_dims.emplace_back(1);
+ }
+ for (uint32_t axis = 0; axis < lhs_rank; ++axis)
+ {
+ lhs_normalized_dims.emplace_back(lhs_shape.dim(axis));
+ }
+
+ for (uint32_t n = 0; n < out_rank - rhs_rank; ++n)
+ {
+ rhs_normalized_dims.emplace_back(1);
+ }
+ for (uint32_t axis = 0; axis < rhs_rank; ++axis)
+ {
+ rhs_normalized_dims.emplace_back(rhs_shape.dim(axis));
+ }
+
+ nnfw::util::tensor::Shape out_shape(out_rank);
+
+ for (uint32_t axis = 0; axis < out_rank; ++axis)
+ {
+ out_shape.dim(axis) = std::max(lhs_normalized_dims.at(axis), rhs_normalized_dims.at(axis));
+ }
+
+ return out_shape;
+}
+
+} // namespace tflite
+} // namespace support
+} // namespace nnfw
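
broadcast follows NumPy-style alignment: right-align both shapes, pad the
shorter one with 1s, and take the per-axis maximum, so {3, 1, 5} against
{4, 1} yields {3, 4, 5}. Note that mismatched non-1 dimensions are not
rejected; the helper simply takes the maximum, so callers must pass
broadcast-compatible shapes. A usage sketch:

    #include "support/tflite/TensorShapeUtils.h"

    #include <cassert>

    int main()
    {
      const nnfw::util::tensor::Shape lhs{3, 1, 5};
      const nnfw::util::tensor::Shape rhs{4, 1};

      const auto out = nnfw::support::tflite::broadcast(lhs, rhs);

      assert(out.rank() == 3);
      assert(out.dim(0) == 3 && out.dim(1) == 4 && out.dim(2) == 5);
      return 0;
    }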
diff --git a/libs/support/tflite/src/TensorView.cpp b/libs/support/tflite/src/TensorView.cpp
deleted file mode 100644
index 9e164acc2..000000000
--- a/libs/support/tflite/src/TensorView.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "support/tflite/TensorView.h"
-
-#include <cassert>
-
-namespace nnfw
-{
-namespace support
-{
-namespace tflite
-{
-
-TensorView<float>::TensorView(const nnfw::util::tensor::Shape &shape, float *base) : _shape{shape}, _base{base}
-{
- // Set 'stride'
- _stride.init(_shape);
-}
-
-float TensorView<float>::at(const nnfw::util::tensor::Index &index) const
-{
- const auto offset = _stride.offset(index);
-
- return *(_base + offset);
-}
-
-float &TensorView<float>::at(const nnfw::util::tensor::Index &index)
-{
- const auto offset = _stride.offset(index);
-
- return *(_base + offset);
-}
-
-TensorView<float> TensorView<float>::make(::tflite::Interpreter &interp, int tensor_index)
-{
- auto tensor_ptr = interp.tensor(tensor_index);
-
-  // TODO Enable the following asserts
- // assert(isFloatTensor(tensor_ptr));
- // assert(isFeatureTensor(tensor_ptr));
-
- // Set 'shape'
- nnfw::util::tensor::Shape shape(tensor_ptr->dims->size);
-
- for (uint32_t axis = 0; axis < shape.rank(); ++axis)
- {
- shape.dim(axis) = tensor_ptr->dims->data[axis];
- }
-
- return TensorView<float>(shape, interp.typed_tensor<float>(tensor_index));
-}
-
-} // namespace tflite
-} // namespace support
-} // namespace nnfw
diff --git a/libs/support/tflite/src/TensorView.test.cpp b/libs/support/tflite/src/TensorView.test.cpp
index 75993a6da..1d3a70500 100644
--- a/libs/support/tflite/src/TensorView.test.cpp
+++ b/libs/support/tflite/src/TensorView.test.cpp
@@ -18,9 +18,24 @@
#include <cassert>
+void int_test(void)
+{
+ int value[6] = {1, 2, 3, 4, 5, 6};
+
+ const nnfw::util::tensor::Shape shape{2, 3};
+ const nnfw::support::tflite::TensorView<int> view{shape, value};
+
+ assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1);
+ assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2);
+ assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3);
+ assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4);
+ assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5);
+ assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6);
+}
+
int main(int argc, char **argv)
{
- float value[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
+ float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
const nnfw::util::tensor::Shape shape{2, 3};
const nnfw::support::tflite::TensorView<float> view{shape, value};
@@ -32,5 +47,7 @@ int main(int argc, char **argv)
assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5.0f);
assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6.0f);
+ int_test();
+
return 0;
}
diff --git a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp b/libs/support/tflite/src/interp/FlatBufferBuilder.cpp
index f46c74652..67df13f34 100644
--- a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp
+++ b/libs/support/tflite/src/interp/FlatBufferBuilder.cpp
@@ -16,7 +16,7 @@
#include "support/tflite/interp/FlatBufferBuilder.h"
-#include <tensorflow/contrib/lite/kernels/register.h>
+#include "support/tflite/kernels/register.h"
namespace nnfw
{
diff --git a/libs/support/tflite/src/kernels/RSQRT.cpp b/libs/support/tflite/src/kernels/RSQRT.cpp
new file mode 100644
index 000000000..13efe0ed9
--- /dev/null
+++ b/libs/support/tflite/src/kernels/RSQRT.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "support/tflite/kernels/RSQRT.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <cmath>
+#include <iostream>
+
+namespace tflite
+{
+namespace ops
+{
+namespace custom
+{
+namespace nnfw
+{
+namespace RSQRT
+{
+
+void *InitRSQRT(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; }
+
+void FreeRSQRT(TfLiteContext *context, void *buffer) {}
+
+TfLiteStatus PrepareRSQRT(TfLiteContext *context, TfLiteNode *node)
+{
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ const TfLiteTensor *input = GetInput(context, node, 0);
+ TfLiteTensor *output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+ // Quantized float is not supported yet.
+ TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+ return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims));
+}
+
+inline TfLiteStatus Eval(TfLiteContext *context, TfLiteNode *node, float float_func(float))
+{
+ const TfLiteTensor *input = GetInput(context, node, 0);
+ TfLiteTensor *output = GetOutput(context, node, 0);
+ switch (input->type)
+ {
+ case kTfLiteFloat32:
+ {
+ size_t elements = NumElements(input);
+ const float *in = input->data.f;
+ const float *in_end = in + elements;
+ float *out = output->data.f;
+ for (; in < in_end; in++, out++)
+ *out = float_func(*in);
+ return kTfLiteOk;
+ }
+ default:
+ {
+ context->ReportError(context, "Input type is %d, requires float32", input->type);
+ return kTfLiteError;
+ }
+ }
+}
+
+TfLiteStatus EvalRSQRT(TfLiteContext *context, TfLiteNode *node)
+{
+ return Eval(context, node, [](float f) { return 1.f / std::sqrt(f); });
+}
+
+} // namespace RSQRT
+} // namespace nnfw
+} // namespace custom
+} // namespace ops
+} // namespace tflite
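
EvalRSQRT computes 1/sqrt(x) per element, and PrepareRSQRT restricts the op to
float32 tensors. A hedged registration sketch; the custom-op name "RSQRT" and
the MutableOpResolver wiring are assumptions, since the matching register.h
change is outside this hunk:

    #include "support/tflite/kernels/RSQRT.h"

    #include "tensorflow/contrib/lite/model.h"

    void register_rsqrt(::tflite::MutableOpResolver &resolver)
    {
      // First four TfLiteRegistration fields: init, free, prepare, invoke.
      static TfLiteRegistration r = {::tflite::ops::custom::nnfw::RSQRT::InitRSQRT,
                                     ::tflite::ops::custom::nnfw::RSQRT::FreeRSQRT,
                                     ::tflite::ops::custom::nnfw::RSQRT::PrepareRSQRT,
                                     ::tflite::ops::custom::nnfw::RSQRT::EvalRSQRT};

      resolver.AddCustom("RSQRT", &r);
    }

    // Expected numerics: input {1.0f, 4.0f, 16.0f} produces {1.0f, 0.5f, 0.25f}.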
diff --git a/libs/support/tflite/src/kernels/SquaredDifference.cpp b/libs/support/tflite/src/kernels/SquaredDifference.cpp
new file mode 100644
index 000000000..25e10a8ed
--- /dev/null
+++ b/libs/support/tflite/src/kernels/SquaredDifference.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "support/tflite/kernels/SquaredDifference.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <iostream>
+
+namespace tflite
+{
+namespace ops
+{
+namespace custom
+{
+namespace nnfw
+{
+namespace SquaredDifference
+{
+
+void *InitSquaredDifference(TfLiteContext *context, const char *buffer, size_t length)
+{
+ return nullptr;
+}
+
+void FreeSquaredDifference(TfLiteContext *context, void *buffer) {}
+
+TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node)
+{
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ const TfLiteTensor *input1 = GetInput(context, node, 0);
+ const TfLiteTensor *input2 = GetInput(context, node, 1);
+ TfLiteTensor *output = GetOutput(context, node, 0);
+
+ TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+ TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+
+ return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input1->dims));
+}
+
+TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node)
+{
+
+ const TfLiteTensor *input1 = GetInput(context, node, 0);
+ const TfLiteTensor *input2 = GetInput(context, node, 1);
+
+ TfLiteTensor *output = GetOutput(context, node, 0);
+
+ size_t elements = NumElements(input1);
+
+ switch (input1->type)
+ {
+ case kTfLiteFloat32:
+ {
+ const float *in1 = input1->data.f;
+ const float *in2 = input2->data.f;
+ const float *in_end1 = in1 + elements;
+ float *out = output->data.f;
+
+ for (; in1 < in_end1; in1++, in2++, out++)
+ *out = ((*in1 - *in2) * (*in1 - *in2));
+
+ return kTfLiteOk;
+ }
+ case kTfLiteInt32:
+ {
+ const int *in1 = input1->data.i32;
+ const int *in2 = input2->data.i32;
+ const int *in_end1 = in1 + elements;
+ int *out = output->data.i32;
+
+ for (; in1 < in_end1; in1++, in2++, out++)
+ *out = ((*in1 - *in2) * (*in1 - *in2));
+
+ return kTfLiteOk;
+ }
+ case kTfLiteInt64:
+ {
+ const int64_t *in1 = input1->data.i64;
+      const int64_t *in2 = input2->data.i64;
+ const int64_t *in_end1 = in1 + elements;
+ int64_t *out = output->data.i64;
+
+ for (; in1 < in_end1; in1++, in2++, out++)
+ *out = ((*in1 - *in2) * (*in1 - *in2));
+
+ return kTfLiteOk;
+ }
+ default:
+ {
+      context->ReportError(context, "Input type %d is not supported", input1->type);
+ return kTfLiteError;
+ }
+ }
+}
+
+} // namespace SquaredDifference
+} // namespace nnfw
+} // namespace custom
+} // namespace ops
+} // namespace tflite
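
EvalSquaredDifference implements out[i] = (in1[i] - in2[i])^2 for float32,
int32, and int64 inputs of identical shape. A minimal reference check of the
semantics, independent of the TF Lite runtime:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const int32_t in1[3] = {5, -2, 7};
      const int32_t in2[3] = {3, 2, 7};
      int32_t out[3];

      // Same elementwise rule as the kTfLiteInt32 branch above.
      for (int i = 0; i < 3; ++i)
        out[i] = (in1[i] - in2[i]) * (in1[i] - in2[i]);

      assert(out[0] == 4 && out[1] == 16 && out[2] == 0);
      return 0;
    }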
diff --git a/libs/support/tflite/src/kernels/TensorFlowMax.cpp b/libs/support/tflite/src/kernels/TensorFlowMax.cpp
new file mode 100644
index 000000000..abc6fda4e
--- /dev/null
+++ b/libs/support/tflite/src/kernels/TensorFlowMax.cpp
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "support/tflite/kernels/TensorFlowMax.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <iostream>
+
+namespace tflite
+{
+namespace ops
+{
+namespace custom
+{
+namespace nnfw
+{
+namespace TensorFlowMax
+{
+
+struct TensorFlowMaxOp
+{
+ TensorFlowMaxOp(TfLiteContext *context, TfLiteNode *node)
+ {
+ input = tflite::GetInput(context, node, 0);
+ axis = tflite::GetInput(context, node, 1);
+ output = tflite::GetOutput(context, node, 0);
+ }
+ const TfLiteTensor *input;
+ const TfLiteTensor *axis;
+ TfLiteTensor *output;
+};
+
+void *InitTensorFlowMax(TfLiteContext *context, const char *buffer, size_t length)
+{
+ // Creates two temp tensors to store index and axis for internal
+ // implementation only.
+ auto *scratch_tensor_index = new int;
+ context->AddTensors(context, 2, scratch_tensor_index);
+ return scratch_tensor_index;
+}
+
+void FreeTensorFlowMax(TfLiteContext *context, void *buffer)
+{
+ // 'buffer' is the scratch tensor index allocated with 'new int' in
+ // InitTensorFlowMax, so it must be deleted as an int; it is never a
+ // heap-allocated TensorFlowMaxOp.
+ delete static_cast<int *>(buffer);
+}
+
+// Resizes the temp tensor that stores resolved axis.
+TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowMaxOp *op_context,
+ TfLiteTensor *resolved_axis)
+{
+ TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1);
+ axis_size->data[0] = static_cast<int>(tflite::NumElements(op_context->axis));
+ return context->ResizeTensor(context, resolved_axis, axis_size);
+}
+
+// Resizes output array based on the input size and resolved axis.
+TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_context)
+{
+ size_t num_axis = tflite::NumElements(op_context->axis);
+ const TfLiteIntArray *input_dims = op_context->input->dims;
+ int input_num_dims = tflite::NumDimensions(op_context->input);
+ const int *axis = op_context->axis->data.i32;
+
+ {
+ // Calculates size of reducing axis.
+ int num_reduce_axis = num_axis;
+ for (int i = 0; i < num_axis; ++i)
+ {
+ int current = axis[i];
+ if (current < 0)
+ {
+ current += input_num_dims;
+ }
+ TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims);
+ for (int j = 0; j < i; ++j)
+ {
+ int previous = axis[j];
+ if (previous < 0)
+ {
+ previous += input_num_dims;
+ }
+ if (current == previous)
+ {
+ --num_reduce_axis;
+ break;
+ }
+ }
+ }
+ // Determines output dimensions.
+ TfLiteIntArray *output_dims = TfLiteIntArrayCreate(input_num_dims - num_reduce_axis);
+ int num_skip_axis = 0;
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ bool is_axis = false;
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx)
+ {
+ ++num_skip_axis;
+ is_axis = true;
+ break;
+ }
+ }
+ if (!is_axis)
+ {
+ output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
+ }
+ }
+ return context->ResizeTensor(context, op_context->output, output_dims);
+ }
+}
+
+// Initializes temp tensors to store index and resolved axis.
+TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node,
+ TensorFlowMaxOp *op_context)
+{
+ // Creates a temp index to iterate through input data.
+ int *scratch_tensor_index = reinterpret_cast<int *>(node->user_data);
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(2);
+ node->temporaries->data[0] = *scratch_tensor_index;
+ TfLiteTensor *scratch_tensor = &context->tensors[node->temporaries->data[0]];
+ scratch_tensor->type = kTfLiteInt32;
+ scratch_tensor->allocation_type = kTfLiteArenaRw;
+ TfLiteIntArray *index_size = TfLiteIntArrayCreate(1);
+ index_size->data[0] = tflite::NumDimensions(op_context->input);
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size));
+
+ // Creates a temp tensor to store resolved axis given input data.
+ node->temporaries->data[1] = *scratch_tensor_index + 1;
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ resolved_axis->type = kTfLiteInt32;
+ return kTfLiteOk;
+}
+
+TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node)
+{
+ TF_LITE_ENSURE_EQ(context, tflite::NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, tflite::NumOutputs(node), 1);
+
+ TensorFlowMaxOp op_context(context, node);
+ TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ // Leaves work to Eval if axis is not constant; else resizes output.
+ if (!tflite::IsConstantTensor(op_context.axis))
+ {
+ tflite::SetTensorToDynamic(op_context.output);
+ tflite::SetTensorToDynamic(resolved_axis);
+ return kTfLiteOk;
+ }
+ resolved_axis->allocation_type = kTfLiteArenaRw;
+ TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis));
+ return ResizeOutputTensor(context, &op_context);
+}
+
+// Gets the offset of an index when expanded on the given axes. When expanding,
+// the flattened offset does not change if the output index changes on one of
+// the given axes. For example, expanding a 2D tensor to 3D on axis 0 maps
+// index (0, 1, 2) and index (1, 1, 2) to the same flattened offset.
+inline size_t ExpandedInputOffset(const int num_dims, const int *dims, const int *index,
+ const int num_axis, const int *axis)
+{
+ size_t offset = 0;
+ int out_idx = 0;
+ for (int in_idx = 0; in_idx < num_dims; ++in_idx)
+ {
+ // if we need to expand this axis
+ bool is_axis = false;
+ if (axis != nullptr)
+ {
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (in_idx == axis[axis_idx])
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ }
+ if (!is_axis)
+ {
+ offset = offset * static_cast<size_t>(dims[in_idx]) + static_cast<size_t>(index[out_idx]);
+ out_idx++;
+ }
+ else
+ {
+ offset = offset * static_cast<size_t>(dims[in_idx]);
+ }
+ }
+ return offset;
+}
+
+// Gets the offset of an index when reducing on the given axes. When reducing,
+// the flattened offset does not change if the input index changes on one of
+// the given axes. For example, reducing a 3D tensor to 2D by eliminating
+// axis 0 maps index (0, 1, 2) and index (1, 1, 2) to the same flattened
+// offset.
+// TODO(kanlig): uses Dims to represent dimensions.
+inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
+ const int num_axis, const int *axis)
+{
+ size_t offset = 0;
+ for (int idx = 0; idx < num_dims; ++idx)
+ {
+ // if we need to skip this axis
+ bool is_axis = false;
+ if (axis != nullptr)
+ {
+ for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
+ {
+ if (idx == axis[axis_idx])
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ }
+ if (!is_axis)
+ {
+ offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
+ }
+ }
+ return offset;
+}
+
+// Gets next index to iterate through a multidimensional array.
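+// For example, with dims = {2, 3}, current = {0, 2} advances to {1, 0} and
+// the function returns true, while current = {1, 2} wraps back to {0, 0} and
+// it returns false, signalling the end of the iteration.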
+inline bool NextIndex(TfLiteContext *context, const int num_dims, const int *dims, int *current)
+{
+ int carry = 1;
+ for (int idx = num_dims - 1; idx >= 0; --idx)
+ {
+ int current_val = current[idx] + carry;
+ TF_LITE_ENSURE(context, (dims[idx] >= current_val));
+ if (dims[idx] == current_val)
+ {
+ current[idx] = 0;
+ }
+ else
+ {
+ current[idx] = current_val;
+ carry = 0;
+ break;
+ }
+ }
+ return (carry == 0);
+}
+
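+// Computes a max reduction over the axes listed in 'axis'. For example, a
+// 2x3 input {{1, 5, 2}, {4, 0, 6}} reduced over axis = {0} yields {4, 5, 6},
+// and reduced over axis = {1} yields {5, 6}.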
+template <typename T>
+inline TfLiteStatus
+CustomMax(TfLiteContext *context, T *input_data, const int *input_dims, const int input_num_dims,
+ T *output_data, const int *output_dims, const int output_num_dims, const int *axis,
+ const int num_axis_dimensions, bool keep_dims, int *temp_index, int *resolved_axis)
+{
+ // resolves axis.
+ int num_resolved_axis = 0;
+ for (int idx = 0; idx < num_axis_dimensions; ++idx)
+ {
+ int current = axis[idx];
+ TF_LITE_ENSURE(context, (current < input_num_dims && current + input_num_dims >= 0));
+ if (current < 0)
+ {
+ current += input_num_dims;
+ }
+ bool is_dup = false;
+ for (int j = 0; j < num_resolved_axis; ++j)
+ {
+ if (resolved_axis[j] == current)
+ {
+ is_dup = true;
+ break;
+ }
+ }
+ if (!is_dup)
+ {
+ resolved_axis[num_resolved_axis++] = current;
+ }
+ }
+
+ TF_LITE_ENSURE(context, (input_num_dims > 0));
+ TF_LITE_ENSURE(context, (input_dims != nullptr));
+ TF_LITE_ENSURE(context, (temp_index != nullptr));
+
+ // Initializes the output with the first input element along each reduced axis.
+ for (int idx = 0; idx < output_num_dims; ++idx)
+ {
+ temp_index[idx] = 0;
+ }
+ for (bool has_next = true; has_next;
+ has_next = NextIndex(context, output_num_dims, output_dims, temp_index))
+ {
+ size_t output_offset =
+ ReducedOutputOffset(output_num_dims, output_dims, temp_index, 0, nullptr);
+ size_t input_offset = ExpandedInputOffset(input_num_dims, input_dims, temp_index,
+ num_resolved_axis, resolved_axis);
+ output_data[output_offset] = input_data[input_offset];
+ }
+
+ // resets temp index.
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ temp_index[idx] = 0;
+ }
+
+ // iterates through input_data.
+ for (bool has_next = true; has_next;
+ has_next = NextIndex(context, input_num_dims, input_dims, temp_index))
+ {
+ size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
+ size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index,
+ num_resolved_axis, resolved_axis);
+ if (output_data[output_offset] < input_data[input_offset])
+ {
+ output_data[output_offset] = input_data[input_offset];
+ }
+ }
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node)
+{
+ TensorFlowMaxOp op_context(context, node);
+ int num_axis = static_cast<int>(tflite::NumElements(op_context.axis));
+ TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]];
+ TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]];
+ // Resize the output tensor if the output tensor is dynamic.
+ if (tflite::IsDynamicTensor(op_context.output))
+ {
+ TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis));
+ TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+ }
+
+ TfLiteStatus returnStatus = kTfLiteOk;
+ switch (op_context.input->type)
+ {
+ case kTfLiteFloat32:
+ returnStatus = CustomMax<float>(
+ context, op_context.input->data.f, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.f, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32, num_axis, false,
+ temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteInt32:
+ returnStatus = CustomMax<int>(context, op_context.input->data.i32,
+ op_context.input->dims->data, op_context.input->dims->size,
+ op_context.output->data.i32, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32,
+ num_axis, false, temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteUInt8:
+ returnStatus = CustomMax<uint8_t>(
+ context, op_context.input->data.uint8, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.uint8,
+ op_context.output->dims->data, op_context.output->dims->size, op_context.axis->data.i32,
+ num_axis, false, temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ case kTfLiteInt64:
+ returnStatus = CustomMax<int64_t>(
+ context, op_context.input->data.i64, op_context.input->dims->data,
+ op_context.input->dims->size, op_context.output->data.i64, op_context.output->dims->data,
+ op_context.output->dims->size, op_context.axis->data.i32, num_axis, false,
+ temp_index->data.i32, resolved_axis->data.i32);
+ break;
+ default:
+ returnStatus = kTfLiteError;
+ }
+
+ return returnStatus;
+}
+} // namespace TensorFlowMax
+} // namespace nnfw
+} // namespace custom
+} // namespace ops
+} // namespace tflite
diff --git a/libs/support/tflite/src/kernels/register.cpp b/libs/support/tflite/src/kernels/register.cpp
new file mode 100644
index 000000000..6700b4de4
--- /dev/null
+++ b/libs/support/tflite/src/kernels/register.cpp
@@ -0,0 +1,169 @@
+/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE This code is derived from the following file (in TensorFlow)
+// 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.cc'
+#include "support/tflite/kernels/register.h"
+#include "support/tflite/kernels/CustomOps.h"
+
+// TODO Use namespace nnfw
+namespace tflite
+{
+namespace ops
+{
+namespace builtin
+{
+
+TfLiteRegistration *Register_RELU();
+TfLiteRegistration *Register_RELU_N1_TO_1();
+TfLiteRegistration *Register_RELU6();
+TfLiteRegistration *Register_TANH();
+TfLiteRegistration *Register_LOGISTIC();
+TfLiteRegistration *Register_AVERAGE_POOL_2D();
+TfLiteRegistration *Register_MAX_POOL_2D();
+TfLiteRegistration *Register_L2_POOL_2D();
+TfLiteRegistration *Register_CONV_2D();
+TfLiteRegistration *Register_DEPTHWISE_CONV_2D();
+TfLiteRegistration *Register_SVDF();
+TfLiteRegistration *Register_RNN();
+TfLiteRegistration *Register_BIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration *Register_UNIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration *Register_EMBEDDING_LOOKUP();
+TfLiteRegistration *Register_EMBEDDING_LOOKUP_SPARSE();
+TfLiteRegistration *Register_FULLY_CONNECTED();
+TfLiteRegistration *Register_LSH_PROJECTION();
+TfLiteRegistration *Register_HASHTABLE_LOOKUP();
+TfLiteRegistration *Register_SOFTMAX();
+TfLiteRegistration *Register_CONCATENATION();
+TfLiteRegistration *Register_ADD();
+TfLiteRegistration *Register_SPACE_TO_BATCH_ND();
+TfLiteRegistration *Register_DIV();
+TfLiteRegistration *Register_SUB();
+TfLiteRegistration *Register_BATCH_TO_SPACE_ND();
+TfLiteRegistration *Register_MUL();
+TfLiteRegistration *Register_L2_NORMALIZATION();
+TfLiteRegistration *Register_LOCAL_RESPONSE_NORMALIZATION();
+TfLiteRegistration *Register_LSTM();
+TfLiteRegistration *Register_BIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration *Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration *Register_PAD();
+TfLiteRegistration *Register_PADV2();
+TfLiteRegistration *Register_RESHAPE();
+TfLiteRegistration *Register_RESIZE_BILINEAR();
+TfLiteRegistration *Register_SKIP_GRAM();
+TfLiteRegistration *Register_SPACE_TO_DEPTH();
+TfLiteRegistration *Register_GATHER();
+TfLiteRegistration *Register_TRANSPOSE();
+TfLiteRegistration *Register_MEAN();
+TfLiteRegistration *Register_SPLIT();
+TfLiteRegistration *Register_SQUEEZE();
+TfLiteRegistration *Register_STRIDED_SLICE();
+TfLiteRegistration *Register_EXP();
+TfLiteRegistration *Register_TOPK_V2();
+TfLiteRegistration *Register_LOG_SOFTMAX();
+TfLiteRegistration *Register_CAST();
+TfLiteRegistration *Register_DEQUANTIZE();
+TfLiteRegistration *Register_PRELU();
+TfLiteRegistration *Register_MAXIMUM();
+TfLiteRegistration *Register_MINIMUM();
+TfLiteRegistration *Register_ARG_MAX();
+TfLiteRegistration *Register_GREATER();
+TfLiteRegistration *Register_GREATER_EQUAL();
+TfLiteRegistration *Register_LESS();
+TfLiteRegistration *Register_LESS_EQUAL();
+TfLiteRegistration *Register_FLOOR();
+TfLiteRegistration *Register_NEG();
+TfLiteRegistration *Register_SELECT();
+TfLiteRegistration *Register_SLICE();
+TfLiteRegistration *Register_SIN();
+TfLiteRegistration *Register_TRANSPOSE_CONV();
+TfLiteRegistration *Register_SPARSE_TO_DENSE();
+
+BuiltinOpResolver::BuiltinOpResolver()
+{
+ AddBuiltin(BuiltinOperator_RELU, Register_RELU());
+ AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
+ AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
+ AddBuiltin(BuiltinOperator_TANH, Register_TANH());
+ AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
+ AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
+ AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+ AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
+ AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+ AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
+ AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
+ AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+ AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, Register_BIDIRECTIONAL_SEQUENCE_RNN());
+ AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, Register_UNIDIRECTIONAL_SEQUENCE_RNN());
+ AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+ AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE, Register_EMBEDDING_LOOKUP_SPARSE());
+ AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED());
+ AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
+ AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
+ AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+ AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
+ AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+ AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND());
+ AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
+ AddBuiltin(BuiltinOperator_MUL, Register_MUL());
+ AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
+ AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, Register_LOCAL_RESPONSE_NORMALIZATION());
+ AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+ AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, Register_BIDIRECTIONAL_SEQUENCE_LSTM());
+ AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
+ AddBuiltin(BuiltinOperator_PAD, Register_PAD());
+ AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
+ AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
+ AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+ AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
+ AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
+ AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
+ AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE());
+ AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
+ AddBuiltin(BuiltinOperator_DIV, Register_DIV());
+ AddBuiltin(BuiltinOperator_SUB, Register_SUB());
+ AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+ AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
+ AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
+ AddBuiltin(BuiltinOperator_EXP, Register_EXP());
+ AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+ AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
+ AddBuiltin(BuiltinOperator_CAST, Register_CAST());
+ AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+ AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
+ AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+ AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+ AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+ AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
+ AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
+ AddBuiltin(BuiltinOperator_LESS, Register_LESS());
+ AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+ AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
+ AddBuiltin(BuiltinOperator_NEG, Register_NEG());
+ AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
+ AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
+ AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+ AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
+ AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
+
+ AddCustom("TensorFlowMax", tflite::ops::custom::nnfw::Register_TensorFlowMax());
+ AddCustom("RSQRT", tflite::ops::custom::nnfw::Register_RSQRT());
+ AddCustom("SquaredDifference", tflite::ops::custom::nnfw::Register_SquaredDifference());
+}
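+
+// Typical usage mirrors stock TF Lite; a minimal sketch (file name assumed):
+//   auto model = ::tflite::FlatBufferModel::BuildFromFile("model.tflite");
+//   ::tflite::ops::builtin::BuiltinOpResolver resolver; // this resolver
+//   std::unique_ptr<::tflite::Interpreter> interpreter;
+//   ::tflite::InterpreterBuilder(*model, resolver)(&interpreter);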
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp
new file mode 100644
index 000000000..1eada4bca
--- /dev/null
+++ b/libs/support/tflite/src/nnapi_delegate.cpp
@@ -0,0 +1,720 @@
+/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NOTE To minimize diff with upstream tensorflow, disable clang-format
+// clang-format off
+
+// NOTE This code is derived from the following file (in TensorFlow)
+// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc'
+#include "support/tflite/nnapi_delegate.h"
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "NeuralNetworksShim.h"
+#include "NeuralNetworksExShim.h"
+
+#ifdef __ANDROID__
+#include <sys/system_properties.h>
+#endif
+
+namespace nnfw
+{
+
+// TODO(aselle): FATAL leaves resources hanging.
+void FATAL(const char* format, ...) {
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+ fflush(stderr);
+ exit(1);
+}
+
+// TODO(aselle): Change the error model to use status codes.
+#define CHECK_TFLITE_SUCCESS(x) \
+ if (x != kTfLiteOk) { \
+ FATAL("Aborting since tflite returned failure."); \
+ }
+
+#define CHECK_NN(x) \
+ if (x != ANEURALNETWORKS_NO_ERROR) { \
+ FATAL("Aborting since tflite returned failure."); \
+ }
+
+namespace {
+
+int32_t GetAndroidSdkVersion() {
+#ifdef __ANDROID__
+ const char* sdkProp = "ro.build.version.sdk";
+ char sdkVersion[PROP_VALUE_MAX];
+ int length = __system_property_get(sdkProp, sdkVersion);
+ if (length != 0) {
+ for (int i = 0; i < length; ++i) {
+ int digit = sdkVersion[i] - '0';
+ if (digit < 0 || digit > 9) {
+ // Non-numeric SDK version; assume it's higher than expected.
+ return 0xFFFF;
+ }
+ }
+ return atoi(sdkVersion);
+ }
+ FATAL("No %s prop", sdkProp);
+#endif // __ANDROID__
+ return 0;
+}
+
+static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+
+} // namespace
+
+NNAPIAllocation::NNAPIAllocation(const char* filename,
+ ::tflite::ErrorReporter* error_reporter)
+ : MMAPAllocation(filename, error_reporter) {
+ if (mmapped_buffer_ != MAP_FAILED)
+ CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
+ mmap_fd_, 0, &handle_));
+}
+
+NNAPIAllocation::~NNAPIAllocation() {
+ if (handle_) {
+ ANeuralNetworksMemory_free(handle_);
+ }
+}
+
+NNAPIDelegate::~NNAPIDelegate() {
+ if (nn_compiled_model_) {
+ ANeuralNetworksCompilation_free(nn_compiled_model_);
+ nn_compiled_model_ = nullptr;
+ }
+ if (nn_model_) {
+ ANeuralNetworksModel_free(nn_model_);
+ nn_model_ = nullptr;
+ // TODO(aselle): Is this thread-safe and callable multiple times?
+ }
+ // ANeuralNetworksShutdown();
+}
+
+// Adds the tensors of the interpreter to the NN API model.
+// Returns the number of operands added.
+uint32_t addTensorOperands(tflite::Interpreter* interpreter,
+ ANeuralNetworksModel* nn_model,
+ const std::vector<uint32_t>& skip_list) {
+ uint32_t next_id = 0;
+ for (size_t i = 0; i < interpreter->tensors_size(); i++) {
+ // Skip temporary tensors.
+ bool shouldSkip = false;
+ for (auto skip_idx : skip_list) {
+ if (i == skip_idx) {
+ shouldSkip = true;
+ break;
+ }
+ }
+ if (shouldSkip) continue;
+
+ int32_t nn_type = 0;
+ // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
+ float scale = 0.0f;
+ int32_t zeroPoint = 0;
+ TfLiteTensor* tensor = interpreter->tensor(i);
+ switch (tensor->type) {
+ case kTfLiteNoType:
+ // Tensors added during initialization of Ops don't have a type yet and
+ // should not be registered with the NNAPI.
+ continue;
+ case kTfLiteFloat32:
+ nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+ break;
+ case kTfLiteUInt8:
+ nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+ scale = tensor->params.scale;
+ // FIXME The next line is a workaround because currently zero scale is
+ // passed down from TF Lite. Note that the latest NeuralNetworks.h (see
+ // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h)
+ // requires scale to be greater than zero. Remove this workaround once the
+ // scale value is correctly passed.
+ scale = (scale == 0.0f) ? 1.0f : scale;
+ zeroPoint = tensor->params.zero_point;
+ break;
+ case kTfLiteInt32:
+ nn_type = ANEURALNETWORKS_TENSOR_INT32;
+ scale = tensor->params.scale;
+ zeroPoint = tensor->params.zero_point;
+ break;
+ default:
+ FATAL("Unsupported type.");
+ }
+ // TODO(aselle): Note, many of these are intermediate results. Do I need
+ // to ever specify these sizes. I am currently below doing setValue
+ // on all of them, but I shouldn't in the future.
+ // Answer(jeanluc): If all the operators can set the dimension correctly,
+ // you won't need to.
+ ANeuralNetworksOperandType operand_type{
+ nn_type, static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ // TODO(aselle): Based on Michael's suggestion, limiting this to read
+ // only memory
+ if (tensor->allocation_type == kTfLiteMmapRo) {
+ if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
+ static_cast<const ::tflite::Allocation*>(tensor->allocation))) {
+ CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory(
+ nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw),
+ tensor->bytes));
+ } else {
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(
+ nn_model, next_id, tensor->data.raw, tensor->bytes));
+ }
+ } else if (tensor->bytes == 0) {
+ // These zero-byte tensors are reserved optional inputs.
+ CHECK_NN(
+ ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+ }
+
+ ++next_id;
+ }
+ return next_id;
+}
+
+// Adds the operations and their parameters to the NN API model.
+// 'next-id' is the operand ID of the next operand of the model.
+void AddOpsAndParams(tflite::Interpreter* interpreter,
+ ANeuralNetworksModel* nn_model, uint32_t next_id,
+ std::vector<int>* model_state_inputs,
+ std::vector<int>* model_state_outputs) {
+ for (size_t i = 0; i < interpreter->nodes_size(); i++) {
+ const auto* node_and_registration = interpreter->node_and_registration(i);
+ const TfLiteNode& node = node_and_registration->first;
+ const TfLiteRegistration& registration = node_and_registration->second;
+ tflite::BuiltinOperator builtin =
+ static_cast<tflite::BuiltinOperator>(registration.builtin_code);
+
+ // Add the parameters.
+ std::vector<uint32_t> augmented_inputs(
+ node.inputs->data, node.inputs->data + node.inputs->size);
+ std::vector<uint32_t> augmented_outputs(
+ node.outputs->data, node.outputs->data + node.outputs->size);
+
+ auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+ &next_id](int value) {
+ ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+ sizeof(int32_t)))
+ augmented_inputs.push_back(next_id++);
+ };
+
+ auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+ &next_id](float value) {
+ ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+ sizeof(float)))
+ augmented_inputs.push_back(next_id++);
+ };
+
+ // Handle state tensors of RNN, LSTM, SVDF.
+ // For each state_out tensor, a corresponding state_in operand needs to be
+ // created for NNAPI.
+ auto duplicate_state_tensor_float32 =
+ [interpreter, &nn_model, &next_id, &augmented_inputs,
+ &model_state_inputs, &model_state_outputs](int tensor_id) {
+ const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
+ ANeuralNetworksOperandType operand_type{
+ ANEURALNETWORKS_TENSOR_FLOAT32,
+ static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data),
+ tensor->params.scale, tensor->params.zero_point};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ augmented_inputs.push_back(next_id);
+ model_state_inputs->push_back(next_id);
+ model_state_outputs->push_back(tensor_id);
+ next_id++;
+ };
+
+ auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
+
+ auto add_pooling_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ add_scalar_int32(builtin->filter_width);
+ add_scalar_int32(builtin->filter_height);
+ add_scalar_int32(builtin->activation);
+ };
+
+ auto add_convolution_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ add_scalar_int32(builtin->activation);
+ };
+
+ auto add_depthwise_conv_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
+ add_scalar_int32(builtin->padding);
+ add_scalar_int32(builtin->stride_width);
+ add_scalar_int32(builtin->stride_height);
+ add_scalar_int32(builtin->depth_multiplier);
+ add_scalar_int32(builtin->activation);
+ };
+
+ auto add_fully_connected_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
+ add_scalar_int32(builtin->activation);
+ };
+
+ auto add_concatenation_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
+ add_scalar_int32(builtin->axis);
+ if (builtin->activation != kTfLiteActNone) {
+ FATAL("Concatenation does not support fused activation in NNAPI");
+ }
+ };
+
+ auto add_softmax_params = [&add_scalar_float32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
+ add_scalar_float32(builtin->beta);
+ };
+
+ auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+ add_scalar_int32(builtin->block_size);
+ };
+
+ auto add_lstm_params = [&add_scalar_int32,
+ &add_scalar_float32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
+ add_scalar_int32(builtin->activation);
+ add_scalar_float32(builtin->cell_clip);
+ add_scalar_float32(builtin->proj_clip);
+ };
+
+ // LSTM in NNAPI requires scratch tensor as an output operand.
+ auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
+ &next_id, &augmented_outputs]() {
+ int scratch_buffer_index = node.temporaries->data[0];
+ const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
+ ANeuralNetworksOperandType operand_type{
+ ANEURALNETWORKS_TENSOR_FLOAT32,
+ static_cast<uint32_t>(tensor->dims->size),
+ reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
+ tensor->params.zero_point};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+ augmented_outputs.insert(augmented_outputs.begin(), next_id++);
+ };
+
+ auto add_mean_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteMeanParams*>(data);
+ add_scalar_int32(builtin->keep_dims);
+ };
+
+ auto add_svdf_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
+ add_scalar_int32(builtin->rank);
+ add_scalar_int32(builtin->activation);
+ };
+
+ auto add_rnn_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
+ add_scalar_int32(builtin->activation);
+ };
+
+ // Handle optional input tensors.
+ auto add_optional_tensors = [&nn_model, &augmented_inputs,
+ &next_id](int nn_type) {
+ for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
+ if (augmented_inputs[idx] == kOptionalTensor) {
+ const std::vector<uint32_t> dim = {0, 0};
+ ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
+ CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+ CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
+ nullptr, 0))
+ augmented_inputs[idx] = next_id++;
+ }
+ }
+ };
+
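+ // 10 encodes NNAPI 1.0; the cases below bump this to 11 for ops that
+ // require NNAPI 1.1.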
+ int nnapi_version = 10;
+#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc"
+
+ ANeuralNetworksOperationType nn_op_type;
+
+ switch (builtin) {
+ case tflite::BuiltinOperator_ADD:
+ nn_op_type = ANEURALNETWORKS_ADD;
+ add_add_params();
+ break;
+ case tflite::BuiltinOperator_MUL:
+ nn_op_type = ANEURALNETWORKS_MUL;
+ add_add_params();
+ break;
+ case tflite::BuiltinOperator_AVERAGE_POOL_2D:
+ add_pooling_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_MAX_POOL_2D:
+ add_pooling_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_L2_POOL_2D:
+ add_pooling_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
+ break;
+ case tflite::BuiltinOperator_CONV_2D:
+ add_convolution_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_CONV_2D;
+ break;
+ case tflite::BuiltinOperator_RELU:
+ nn_op_type = ANEURALNETWORKS_RELU;
+ break;
+ case tflite::BuiltinOperator_RELU_N1_TO_1:
+ nn_op_type = ANEURALNETWORKS_RELU1;
+ break;
+ case tflite::BuiltinOperator_RELU6:
+ nn_op_type = ANEURALNETWORKS_RELU6;
+ break;
+ case tflite::BuiltinOperator_TANH:
+ nn_op_type = ANEURALNETWORKS_TANH;
+ break;
+ case tflite::BuiltinOperator_FLOOR:
+ nn_op_type = ANEURALNETWORKS_FLOOR;
+ break;
+ case tflite::BuiltinOperator_LOGISTIC:
+ nn_op_type = ANEURALNETWORKS_LOGISTIC;
+ break;
+ case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
+ add_depthwise_conv_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
+ break;
+ case tflite::BuiltinOperator_CONCATENATION:
+ add_concatenation_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_CONCATENATION;
+ break;
+ case tflite::BuiltinOperator_SOFTMAX:
+ add_softmax_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SOFTMAX;
+ break;
+ case tflite::BuiltinOperator_FULLY_CONNECTED:
+ add_fully_connected_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
+ break;
+ case tflite::BuiltinOperator_RESHAPE:
+ nn_op_type = ANEURALNETWORKS_RESHAPE;
+ // add_reshape_params(node.builtin_data);
+ break;
+ case tflite::BuiltinOperator_RESIZE_BILINEAR:
+ add_resize_bilinear_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR;
+ break;
+ case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+ add_space_to_depth_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+ break;
+ case tflite::BuiltinOperator_LSTM: {
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kOutputStateTensor*/ 0]);
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kCellStateTensor*/ 1]);
+ add_lstm_params(node.builtin_data);
+ add_lstm_scratch_tensor_float32();
+ add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
+ nn_op_type = ANEURALNETWORKS_LSTM;
+ break;
+ }
+ case tflite::BuiltinOperator_DEQUANTIZE:
+ nn_op_type = ANEURALNETWORKS_DEQUANTIZE;
+ break;
+ case tflite::BuiltinOperator_SVDF: {
+ duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
+ add_svdf_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_SVDF;
+ break;
+ }
+ case tflite::BuiltinOperator_RNN: {
+ duplicate_state_tensor_float32(
+ node.outputs->data[/*kHiddenStateTensor*/ 0]);
+ add_rnn_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_RNN;
+ break;
+ }
+ case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
+ nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
+ break;
+ case tflite::BuiltinOperator_PAD:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_PAD;
+ break;
+ case tflite::BuiltinOperator_MEAN:
+ nnapi_version = 11; // require NNAPI 1.1
+ add_mean_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_MEAN;
+ break;
+ case tflite::BuiltinOperator_DIV:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_DIV;
+ add_add_params();
+ break;
+ case tflite::BuiltinOperator_SUB:
+ nnapi_version = 11; // require NNAPI 1.1
+ nn_op_type = ANEURALNETWORKS_SUB;
+ add_add_params();
+ break;
+ case tflite::BuiltinOperator_STRIDED_SLICE:
+ add_strided_slice_params(node.builtin_data);
+ nn_op_type = ANEURALNETWORKS_STRIDED_SLICE;
+ break;
+ case tflite::BuiltinOperator_CAST:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_CAST_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_TOPK_V2:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TOPK_V2_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_GATHER:
+ add_gather_ex_params(node.builtin_data);
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_GATHER_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_SPLIT:
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_SPLIT_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ case tflite::BuiltinOperator_TRANSPOSE:
+ nn_op_type = ANEURALNETWORKS_TRANSPOSE;
+ // param is almost same as reshape
+ break;
+ case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
+ case tflite::BuiltinOperator_LSH_PROJECTION:
+ case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
+ case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
+ case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
+ case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+ case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
+ case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
+ case tflite::BuiltinOperator_L2_NORMALIZATION:
+ case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
+ case tflite::BuiltinOperator_PADV2:
+ case tflite::BuiltinOperator_CALL:
+ case tflite::BuiltinOperator_SKIP_GRAM:
+ case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+ case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+ case tflite::BuiltinOperator_SQUEEZE:
+ case tflite::BuiltinOperator_EXP:
+ case tflite::BuiltinOperator_LOG_SOFTMAX:
+ case tflite::BuiltinOperator_DELEGATE:
+ case tflite::BuiltinOperator_PRELU:
+ case tflite::BuiltinOperator_MAXIMUM:
+ case tflite::BuiltinOperator_MINIMUM:
+ case tflite::BuiltinOperator_ARG_MAX:
+ case tflite::BuiltinOperator_GREATER:
+ case tflite::BuiltinOperator_GREATER_EQUAL:
+ case tflite::BuiltinOperator_LESS:
+ case tflite::BuiltinOperator_LESS_EQUAL:
+ case tflite::BuiltinOperator_NEG:
+ case tflite::BuiltinOperator_SELECT:
+ case tflite::BuiltinOperator_SLICE:
+ case tflite::BuiltinOperator_SIN:
+ case tflite::BuiltinOperator_TRANSPOSE_CONV:
+ case tflite::BuiltinOperator_SPARSE_TO_DENSE:
+ FATAL("Op code %d is currently not delegated to NNAPI", builtin);
+ nn_op_type = -1; // set to invalid
+ break;
+ case tflite::BuiltinOperator_CUSTOM:
+ std::string custom_name(registration.custom_name);
+ if (custom_name.compare("TensorFlowMax") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+ else if (custom_name.compare("RSQRT") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_RSQRT_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+ else if (custom_name.compare("SquaredDifference") == 0) {
+ CHECK_NN(ANeuralNetworksModel_addOperationEx(
+ nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX,
+ static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(node.outputs->size),
+ reinterpret_cast<uint32_t*>(node.outputs->data)));
+ continue;
+ }
+
+ FATAL("Custom operations are not supported when using NNAPI.");
+ nn_op_type = -1; // set to invalid
+ break;
+ }
+
+ //if (nnapi_version == 11 && kAndroidSdkVersion < 28) {
+ // FATAL("Op %d needs NNAPI1.1", builtin);
+ //}
+
+ // Add the operation.
+ CHECK_NN(ANeuralNetworksModel_addOperation(
+ nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
+ augmented_inputs.data(),
+ static_cast<uint32_t>(augmented_outputs.size()),
+ reinterpret_cast<uint32_t*>(augmented_outputs.data())));
+ }
+}
+
+TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) {
+ // TODO(aselle): This is not correct. need to handle resize invalidation.
+ if (nn_model_ && nn_compiled_model_) return kTfLiteOk;
+
+ if (!nn_model_) {
+ CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+
+ // Find all the temporary tensors and put them in a skip_list.
+ std::vector<uint32_t> skip_list;
+ for (size_t i = 0; i < interpreter->nodes_size(); i++) {
+ const auto* node_and_registration = interpreter->node_and_registration(i);
+ const TfLiteNode& node = node_and_registration->first;
+ if (node.temporaries != nullptr) {
+ for (int j = 0; j < node.temporaries->size; j++) {
+ skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j]));
+ }
+ }
+ }
+
+ uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list);
+ AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
+ &model_states_outputs_);
+
+ std::vector<int> augmented_inputs = interpreter->inputs();
+ std::vector<int> augmented_outputs = interpreter->outputs();
+
+ // All state tensors input/output need to be treated as model input/output.
+ augmented_inputs.insert(augmented_inputs.end(),
+ model_states_inputs_.begin(),
+ model_states_inputs_.end());
+ augmented_outputs.insert(augmented_outputs.end(),
+ model_states_outputs_.begin(),
+ model_states_outputs_.end());
+
+ CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+ nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
+ reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
+ static_cast<uint32_t>(augmented_outputs.size()),
+ reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
+ CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+ }
+ if (!nn_compiled_model_) {
+ CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
+ CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+ }
+ return kTfLiteOk;
+}
+
+TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) {
+ if (!nn_model_) {
+ TF_LITE_ENSURE_STATUS(BuildGraph(interpreter));
+ }
+
+ ANeuralNetworksExecution* execution = nullptr;
+ CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+
+ // Currently perform deep copy of input buffer
+ for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+ int input = interpreter->inputs()[i];
+ // TODO(aselle): Is this what we want or do we want input instead?
+ // TODO(aselle): This should be called setInputValue maybe to be consistent.
+ TfLiteTensor* tensor = interpreter->tensor(input);
+ CHECK_NN(ANeuralNetworksExecution_setInput(
+ execution, i, nullptr, tensor->data.raw, tensor->bytes));
+ }
+
+ // Tell nn api where to place final data.
+ for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+ int output = interpreter->outputs()[i];
+ TfLiteTensor* tensor = interpreter->tensor(output);
+ CHECK_NN(ANeuralNetworksExecution_setOutput(
+ execution, i, nullptr, tensor->data.raw, tensor->bytes));
+ }
+
+ // The state_out of the previous invocation needs to be mapped to the
+ // state_in of the current invocation.
+ for (size_t i = 0; i < model_states_outputs_.size(); i++) {
+ int state_tensor_idx = model_states_outputs_[i];
+ TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
+ // Here we are using a deep copy for state_in tensors so that we are not
+ // reading and writing into the same buffer during an invocation.
+ // TODO(miaowang): using double shared buffer to minimize the copies.
+ CHECK_NN(ANeuralNetworksExecution_setInput(
+ execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
+ tensor->bytes));
+ // Tell NNAPI where to output the state_out.
+ CHECK_NN(ANeuralNetworksExecution_setOutput(
+ execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
+ tensor->bytes));
+ }
+
+ // Currently use blocking compute.
+ ANeuralNetworksEvent* event = nullptr;
+ CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
+ CHECK_NN(ANeuralNetworksEvent_wait(event));
+ ANeuralNetworksEvent_free(event);
+ ANeuralNetworksExecution_free(execution);
+
+#if 0
+ printf("From the NN API:\n");
+ TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+ if (float* data =
+ interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+ size_t num = tensor->bytes / sizeof(float);
+ for (float* p = data; p < data + num; p++) {
+ printf(" %f", *p);
+ }
+ printf("\n");
+ }
+#endif
+
+ return kTfLiteOk;
+}
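+
+// Rough usage sketch (Invoke lazily builds and compiles the NN model on the
+// first call, then reads inputs from and writes outputs to the interpreter):
+//   nnfw::NNAPIDelegate delegate;
+//   delegate.Invoke(interpreter.get());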
+
+} // namespace nnfw
+
+// clang-format on
diff --git a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
new file mode 100644
index 000000000..ea485fe45
--- /dev/null
+++ b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc
@@ -0,0 +1,41 @@
+// This file is included from AddOpsAndParams defined in nnapi_delegate.cpp
+// and contains lambdas that extend the original TensorFlow Lite implementation.
+ auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) {
+ auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data);
+ if (builtin->align_corners) {
+ FATAL("Resize bilinear does not support align corners in NNAPI");
+ }
+
+ TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back());
+ assert(tensor->type == kTfLiteInt32);
+ assert(tensor->bytes == sizeof(int)*2);
+ augmented_inputs.pop_back();
+
+ int height = ((int*)(tensor->data.raw))[1];
+ int width = ((int*)(tensor->data.raw))[0];
+ add_scalar_int32(height);
+ add_scalar_int32(width);
+ };
+
+ auto add_strided_slice_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data);
+ add_scalar_int32(builtin->begin_mask);
+ add_scalar_int32(builtin->end_mask);
+ // ellipsis_mask and new_axis_mask are not supported by the NN runtime
+ // (the tflite interpreter supports both operations)
+ if (builtin->ellipsis_mask) {
+ FATAL("STRIDE_SLICE does not support ellipsis_mask in NNAPI");
+ }
+ if (builtin->new_axis_mask) {
+ FATAL("STRIDE_SLICE does not support new_axis_mask in NNAPI");
+ }
+ add_scalar_int32(builtin->shrink_axis_mask);
+ };
+
+ auto add_gather_ex_params = [&add_scalar_int32](void* data) {
+ auto builtin = reinterpret_cast<TfLiteGatherParams*>(data);
+ add_scalar_int32(builtin->axis);
+ if (builtin->axis != 0) {
+ FATAL("GATHER does not support axis>0 in NNAPI");
+ }
+ };
diff --git a/libs/util/CMakeLists.txt b/libs/util/CMakeLists.txt
index 565aaf75e..eaa7ae8cf 100644
--- a/libs/util/CMakeLists.txt
+++ b/libs/util/CMakeLists.txt
@@ -3,12 +3,18 @@ set(NNFW_UTILITY_SRCS src/environment.cpp)
list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp)
list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp)
list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp)
-
-set(NNFW_INCLUDE_DIR include)
+list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp)
+if(BUILD_TFLITE_BENCHMARK_MODEL)
+ list(APPEND NNFW_UTILITY_SRCS src/profiling/time.cc)
+endif()
add_library(nnfw_util SHARED ${NNFW_UTILITY_SRCS})
target_include_directories(nnfw_util PUBLIC ${NNFW_INCLUDE_DIR})
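+# Static variant of the same sources; PIC is enabled so the archive can also
+# be linked into shared objects.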
+add_library(static_nnfw_util STATIC ${NNFW_UTILITY_SRCS})
+target_include_directories(static_nnfw_util PUBLIC ${NNFW_INCLUDE_DIR})
+set_target_properties(static_nnfw_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
install(TARGETS nnfw_util
RUNTIME DESTINATION bin COMPONENT libraries
LIBRARY DESTINATION lib COMPONENT libraries)
diff --git a/libs/util/examples/tensor_index_iterator.cpp b/libs/util/examples/tensor_index_iterator.cpp
index a05d78dc4..284e04aa0 100644
--- a/libs/util/examples/tensor_index_iterator.cpp
+++ b/libs/util/examples/tensor_index_iterator.cpp
@@ -16,16 +16,52 @@
#include "util/tensor/IndexIterator.h"
+#include <array>
+
#include <iostream>
+#include <algorithm>
+
+#include <cassert>
+
+void test_iterate(void)
+{
+ const nnfw::util::tensor::Shape shape{3, 4, 7};
+
+ std::array<int, 3 * 4 * 7> array;
+
+ array.fill(0);
+
+ using nnfw::util::tensor::iterate;
+ using nnfw::util::tensor::Index;
+
+ iterate(shape) << [&](const Index &index) {
+ assert(index.rank() == shape.rank());
+
+ const size_t rank = index.rank();
+
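+ // Row-major linearization: offset = ((i0 * d1) + i1) * d2 + i2 for a
+ // shape {d0, d1, d2}, generalized to any rank by the loop below.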
+ uint32_t offset = index.at(0);
+
+ for (size_t axis = 1; axis < rank; ++axis)
+ {
+ offset *= shape.dim(axis);
+ offset += index.at(axis);
+ }
+
+ array[offset] += 1;
+ };
+
+ assert(std::all_of(array.begin(), array.end(), [](int num) { return num == 1; }));
+}
+
int main(int argc, char **argv)
{
+ test_iterate();
+
nnfw::util::tensor::Shape shape{3, 4, 3, 4};
std::cout << "Iterate over tensor{3, 4, 3, 4}" << std::endl;
- nnfw::util::tensor::iterate(shape) << [] (const nnfw::util::tensor::Index &index)
- {
+ nnfw::util::tensor::iterate(shape) << [](const nnfw::util::tensor::Index &index) {
std::cout << "rank: " << index.rank() << std::endl;
for (size_t d = 0; d < index.rank(); ++d)
diff --git a/libs/util/include/util/benchmark.h b/libs/util/include/util/benchmark.h
deleted file mode 100644
index c451eddec..000000000
--- a/libs/util/include/util/benchmark.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_BENCHMARK_H__
-#define __NNFW_UTIL_BENCHMARK_H__
-
-#include <chrono>
-
-namespace nnfw
-{
-namespace util
-{
-// Benckmark support
-namespace benchmark
-{
-
-template <typename T> class Accumulator
-{
-public:
- Accumulator(T &ref) : _ref(ref)
- {
- // DO NOTHING
- }
-
-public:
- T &operator()(void) { return _ref; }
-
-private:
- T &_ref;
-};
-
-template <typename T, typename Callable>
-Accumulator<T> &operator<<(Accumulator<T> &&acc, Callable cb)
-{
- auto begin = std::chrono::steady_clock::now();
- cb();
- auto end = std::chrono::steady_clock::now();
-
- acc() += std::chrono::duration_cast<T>(end - begin);
-
- return acc;
-}
-
-template <typename T> Accumulator<T> measure(T &out)
-{
- return Accumulator<T>(out);
-}
-
-} // namespace benchmark
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_BENCHMARK_H__
diff --git a/libs/util/include/util/environment.h b/libs/util/include/util/environment.h
deleted file mode 100644
index fa9dd519d..000000000
--- a/libs/util/include/util/environment.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __UTIL_ENVIRONMENT_H__
-#define __UTIL_ENVIRONMENT_H__
-
-namespace nnfw
-{
-namespace util
-{
-
-int get_env_int(const char* name);
-bool get_env_bool(const char* name);
-
-}
-}
-
-#include <string>
-
-namespace nnfw
-{
-namespace util
-{
-namespace env
-{
-
-template <typename T> struct Accessor
-{
- virtual ~Accessor() = default;
-
- virtual bool access(T &out) const = 0;
-};
-
-class IntAccessor : public Accessor<int>
-{
-public:
- IntAccessor(const std::string &tag);
-
-public:
- bool access(int &out) const override;
-
-private:
- std::string _tag;
-};
-
-} // namespace env
-} // namespace util
-} // namespace nnfw
-
-#endif // __UTIL_ENVIRONMENT_H__
diff --git a/libs/util/include/util/feature/Index.h b/libs/util/include/util/feature/Index.h
deleted file mode 100644
index e77816669..000000000
--- a/libs/util/include/util/feature/Index.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_INDEX_H__
-#define __NNFW_UTIL_FEATURE_INDEX_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-class Index
-{
-public:
- Index() = default;
-
-public:
- Index(int32_t ch, int32_t row, int32_t col) : _ch{ch}, _row{row}, _col{col}
- {
- // DO NOTHING
- }
-
-public:
- int32_t ch(void) const { return _ch; }
- int32_t row(void) const { return _row; }
- int32_t col(void) const { return _col; }
-
-public:
- int32_t &ch(void) { return _ch; }
- int32_t &row(void) { return _row; }
- int32_t &col(void) { return _col; }
-
-private:
- int32_t _ch;
- int32_t _row;
- int32_t _col;
-};
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_INDEX_H__
diff --git a/libs/util/include/util/feature/IndexIterator.h b/libs/util/include/util/feature/IndexIterator.h
deleted file mode 100644
index dd029f4b6..000000000
--- a/libs/util/include/util/feature/IndexIterator.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__
-#define __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__
-
-#include "util/feature/Shape.h"
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-class IndexIterator
-{
-public:
- IndexIterator(const Shape &shape) : _shape{shape}
- {
- // DO NOTHING
- }
-
-public:
- template <typename Callable> IndexIterator &iter(Callable cb)
- {
- for (uint32_t ch = 0; ch < _shape.C; ++ch)
- {
- for (uint32_t row = 0; row < _shape.H; ++row)
- {
- for (uint32_t col = 0; col < _shape.W; ++col)
- {
- cb(ch, row, col);
- }
- }
- }
-
- return (*this);
- }
-
-private:
- const Shape _shape;
-};
-
-inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
-
-template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
-{
- return it.iter(cb);
-}
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__
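A minimal usage sketch of the removed iterator: operator<< forwards the callback to iter(), which visits every (ch, row, col) with the column index varying fastest.

    #include "util/feature/IndexIterator.h"
    #include "util/feature/Shape.h"

    #include <cstdint>
    #include <iostream>

    int main()
    {
      const nnfw::util::feature::Shape shape{2, 3, 4}; // C, H, W

      nnfw::util::feature::iterate(shape) << [](uint32_t ch, uint32_t row, uint32_t col) {
        std::cout << ch << " " << row << " " << col << std::endl;
      };

      return 0;
    }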
diff --git a/libs/util/include/util/feature/Object.h b/libs/util/include/util/feature/Object.h
deleted file mode 100644
index ca217b4a8..000000000
--- a/libs/util/include/util/feature/Object.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_OBJECT_H__
-#define __NNFW_UTIL_FEATURE_OBJECT_H__
-
-#include "util/feature/Shape.h"
-#include "util/feature/Index.h"
-#include "util/feature/Reader.h"
-
-#include <vector>
-#include <functional>
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-template<typename T> class Object final : public Reader<T>
-{
-public:
- using Generator = std::function<T (const Shape &shape, const Index &index)>;
-
-public:
- Object(const Shape &shape, const Generator &fn) : _shape{shape}
- {
- _value.resize(_shape.C * _shape.H * _shape.W);
-
- for (int32_t ch = 0; ch < _shape.C; ++ch)
- {
- for (int32_t row = 0; row < _shape.H; ++row)
- {
- for (int32_t col = 0; col < _shape.W; ++col)
- {
- _value.at(offsetOf(ch, row, col)) = fn(_shape, Index{ch, row, col});
- }
- }
- }
- }
-
-public:
- const Shape &shape(void) const { return _shape; }
-
-public:
- T at(uint32_t ch, uint32_t row, uint32_t col) const override
- {
- return _value.at(offsetOf(ch, row, col));
- }
-
-private:
- uint32_t offsetOf(uint32_t ch, uint32_t row, uint32_t col) const
- {
- return ch * _shape.H * _shape.W + row * _shape.W + col;
- }
-
-private:
- Shape _shape;
- std::vector<T> _value;
-};
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_OBJECT_H__
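A sketch of how Object was populated (the generator below is an arbitrary example):

    #include "util/feature/Object.h"

    #include <iostream>

    int main()
    {
      using nnfw::util::feature::Index;
      using nnfw::util::feature::Shape;

      const Shape shape{1, 2, 2};

      // The generator runs once per (ch, row, col) to fill the backing buffer.
      const nnfw::util::feature::Object<float> obj{
          shape, [](const Shape &, const Index &index) {
            return static_cast<float>(index.row() * 10 + index.col());
          }};

      std::cout << obj.at(0, 1, 1) << std::endl; // prints 11
      return 0;
    }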
diff --git a/libs/util/include/util/feature/Reader.h b/libs/util/include/util/feature/Reader.h
deleted file mode 100644
index 112503d80..000000000
--- a/libs/util/include/util/feature/Reader.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_READER_H__
-#define __NNFW_UTIL_FEATURE_READER_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-template <typename T> struct Reader
-{
- virtual ~Reader() = default;
-
- virtual T at(uint32_t ch, uint32_t row, uint32_t col) const = 0;
-};
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_READER_H__
diff --git a/libs/util/include/util/feature/Shape.h b/libs/util/include/util/feature/Shape.h
deleted file mode 100644
index e05c97f51..000000000
--- a/libs/util/include/util/feature/Shape.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_SHAPE_H__
-#define __NNFW_UTIL_FEATURE_SHAPE_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-struct Shape
-{
- int32_t C; // Depth
- int32_t H; // Height
- int32_t W; // Width
-
- Shape() = default;
- Shape(int32_t depth, int32_t height, int32_t width) : C{depth}, H{height}, W{width}
- {
- // DO NOTHING
- }
-
-};
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_SHAPE_H__
diff --git a/libs/util/include/util/feature/TextFormatter.h b/libs/util/include/util/feature/TextFormatter.h
deleted file mode 100644
index 91b4c9fff..000000000
--- a/libs/util/include/util/feature/TextFormatter.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__
-#define __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__
-
-#include "util/feature/Shape.h"
-#include "util/feature/Reader.h"
-
-#include <ostream>
-#include <iomanip>
-#include <limits>
-
-namespace nnfw
-{
-namespace util
-{
-namespace feature
-{
-
-template <typename T> class TextFormatter
-{
-public:
- TextFormatter(const Shape &shape, const Reader<T> &data)
- : _shape(shape), _data(data)
- {
- // DO NOTHING
- }
-
-public:
- const Shape &shape(void) const { return _shape; }
- const Reader<T> &data(void) const { return _data; }
-
-private:
- const Shape &_shape;
- const Reader<T> &_data;
-};
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os, const TextFormatter<T> &fmt)
-{
- const auto &shape = fmt.shape();
-
- for (uint32_t ch = 0; ch < shape.C; ++ch)
- {
- os << " Channel " << ch << ":" << std::endl;
- for (uint32_t row = 0; row < shape.H; ++row)
- {
- os << " ";
- for (uint32_t col = 0; col < shape.W; ++col)
- {
- const auto value = fmt.data().at(ch, row, col);
- os << std::right;
- os << std::fixed;
- os << std::setw(std::numeric_limits<T>::digits10 + 2);
- os << std::setprecision(5);
- os << value;
- os << " ";
- }
- os << std::endl;
- }
- }
-
- return os;
-}
-
-} // namespace feature
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__
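A sketch of dumping a feature map with the formatter above (reusing the Object from util/feature/Object.h as the Reader):

    #include "util/feature/Object.h"
    #include "util/feature/TextFormatter.h"

    #include <iostream>

    int main()
    {
      using namespace nnfw::util::feature;

      const Shape shape{1, 2, 2};
      const Object<float> obj{shape, [](const Shape &, const Index &index) {
                                return static_cast<float>(index.row() + index.col());
                              }};

      // operator<< walks the map channel by channel with fixed-width columns.
      std::cout << TextFormatter<float>{shape, obj} << std::endl;
      return 0;
    }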
diff --git a/libs/util/include/util/fp32.h b/libs/util/include/util/fp32.h
deleted file mode 100644
index 604435470..000000000
--- a/libs/util/include/util/fp32.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_FP32_H__
-#define __NNFW_UTIL_FP32_H__
-
-#include <cmath>
-#include <cfloat>
-#include <algorithm>
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace fp32
-{
-
-inline float relative_diff(float lhs, float rhs)
-{
- const auto diff = std::fabs(lhs - rhs);
- const auto base = std::max(std::fabs(lhs), std::fabs(rhs));
-
- return diff / base;
-}
-
-inline bool epsilon_equal(float expected, float obtained, uint32_t tolerance = 1)
-{
- if (std::isnan(expected) && std::isnan(obtained))
- {
- return true;
- }
-
- // Let's use relative epsilon comparison
- const auto diff = std::fabs(expected - obtained);
- const auto max = std::max(std::fabs(expected), std::fabs(obtained));
-
- return diff <= (max * FLT_EPSILON * tolerance);
-}
-
-inline bool absolute_epsilon_equal(float expected, float obtained, float tolerance = 0.001)
-{
- if (std::isnan(expected) && std::isnan(obtained))
- {
- return true;
- }
-
- // Let's use absolute epsilon comparison
- const auto diff = std::fabs(expected - obtained);
-
- return diff <= tolerance;
-}
-
-} // namespace fp32
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_FP32_H__
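The two predicates trade off differently: the relative check scales the allowed error with the operand magnitude, while the absolute check is the safer choice near zero, where a relative bound collapses (note that relative_diff evaluates 0/0, i.e. NaN, when both operands are zero). A small sanity sketch:

    #include "util/fp32.h"

    #include <cassert>
    #include <cfloat>

    int main()
    {
      using namespace nnfw::util::fp32;

      assert(epsilon_equal(1.0f, 1.0f + FLT_EPSILON)); // one ulp apart at 1.0
      assert(!epsilon_equal(1.0f, 1.1f));

      assert(absolute_epsilon_equal(0.0f, 0.0005f)); // within 0.001 of zero
      assert(!absolute_epsilon_equal(0.0f, 0.01f));

      return 0;
    }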
diff --git a/libs/util/include/util/kernel/IndexIterator.h b/libs/util/include/util/kernel/IndexIterator.h
deleted file mode 100644
index ea6b48826..000000000
--- a/libs/util/include/util/kernel/IndexIterator.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_KERNEL_INDEX_ITERATOR_H__
-#define __NNFW_UTIL_KERNEL_INDEX_ITERATOR_H__
-
-#include "util/kernel/Shape.h"
-
-namespace nnfw
-{
-namespace util
-{
-namespace kernel
-{
-
-class IndexIterator
-{
-public:
- IndexIterator(const Shape &shape) : _shape{shape}
- {
- // DO NOTHING
- }
-
-public:
- template <typename Callable> IndexIterator &iter(Callable cb)
- {
- for (uint32_t nth = 0; nth < _shape.N; ++nth)
- {
- for (uint32_t ch = 0; ch < _shape.C; ++ch)
- {
- for (uint32_t row = 0; row < _shape.H; ++row)
- {
- for (uint32_t col = 0; col < _shape.W; ++col)
- {
- cb(nth, ch, row, col);
- }
- }
- }
- }
-
- return (*this);
- }
-
-private:
- const Shape _shape;
-};
-
-inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
-
-template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
-{
- return it.iter(cb);
-}
-
-} // namespace kernel
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_KERNEL_INDEX_ITERATOR_H__
diff --git a/libs/util/include/util/kernel/RandomObject.h b/libs/util/include/util/kernel/RandomObject.h
deleted file mode 100644
index ceed7a0b0..000000000
--- a/libs/util/include/util/kernel/RandomObject.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__
-#define __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__
-
-#include "util/kernel/Shape.h"
-#include "util/kernel/Reader.h"
-
-#include <vector>
-
-namespace nnfw
-{
-namespace util
-{
-namespace kernel
-{
-
-template<typename T> class RandomObject final : public Reader<T>
-{
-public:
- RandomObject(const Shape &shape) : _shape{shape}
- {
- const uint32_t size = _shape.N * _shape.C * _shape.H * _shape.W;
-
- // TODO Use random number
- for (uint32_t off = 0; off < size; ++off)
- {
- _value.emplace_back(static_cast<float>(off));
- }
- }
-
-public:
- const Shape &shape(void) const { return _shape; }
-
-public:
- T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const override
- {
- uint32_t index = 0;
-
- index += nth * _shape.C * _shape.H * _shape.W;
- index += ch * _shape.H * _shape.W;
- index += row * _shape.W;
- index += col;
-
- return _value.at(index);
- }
-
-private:
- const Shape _shape;
- std::vector<T> _value;
-};
-
-} // namespace kernel
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__
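Despite its name, the removed class filled the buffer with sequential values (see the TODO above), which at least makes the NCHW layout easy to check: the linear offset is ((nth * C + ch) * H + row) * W + col. A sketch:

    #include "util/kernel/RandomObject.h"

    #include <cassert>

    int main()
    {
      const nnfw::util::kernel::Shape shape{2, 3, 4, 5}; // N, C, H, W

      const nnfw::util::kernel::RandomObject<float> obj{shape};

      // offset(1, 2, 3, 4) = ((1 * 3 + 2) * 4 + 3) * 5 + 4 = 119
      assert(obj.at(1, 2, 3, 4) == static_cast<float>(((1 * 3 + 2) * 4 + 3) * 5 + 4));
      return 0;
    }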
diff --git a/libs/util/include/util/kernel/Reader.h b/libs/util/include/util/kernel/Reader.h
deleted file mode 100644
index 9d8f33ad6..000000000
--- a/libs/util/include/util/kernel/Reader.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_KERNEL_READER_H__
-#define __NNFW_UTIL_KERNEL_READER_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace kernel
-{
-
-template <typename T> struct Reader
-{
- virtual ~Reader() = default;
-
- virtual T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const = 0;
-};
-
-} // namespace kernel
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_KERNEL_READER_H__
diff --git a/libs/util/include/util/kernel/Shape.h b/libs/util/include/util/kernel/Shape.h
deleted file mode 100644
index bd2332989..000000000
--- a/libs/util/include/util/kernel/Shape.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_KERNEL_SHAPE_H__
-#define __NNFW_UTIL_KERNEL_SHAPE_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace kernel
-{
-
-struct Shape
-{
- int32_t N;
- int32_t C;
- int32_t H;
- int32_t W;
-
- Shape() = default;
- Shape(int32_t count, int32_t depth, int32_t height, int32_t width)
- : N{count}, C{depth}, H{height}, W{width}
- {
- // DO NOTHING
- }
-};
-
-} // namespace kernel
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_KERNEL_SHAPE_H__
diff --git a/libs/util/include/util/tensor/Index.h b/libs/util/include/util/tensor/Index.h
deleted file mode 100644
index e74b09229..000000000
--- a/libs/util/include/util/tensor/Index.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_INDEX_H__
-#define __NNFW_UTIL_TENSOR_INDEX_H__
-
-#include <cstdint>
-#include <cstddef>
-
-#include <vector>
-#include <initializer_list>
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-struct Index
-{
-public:
- Index(size_t rank)
- {
- _offsets.resize(rank);
- }
-
-public:
- Index(std::initializer_list<int32_t> offsets) : _offsets{offsets}
- {
- // DO NOTHING
- }
-
-public:
- size_t rank(void) const { return _offsets.size(); }
-
-public:
- int32_t at(size_t n) const { return _offsets.at(n); }
- int32_t &at(size_t n) { return _offsets.at(n); }
-
-private:
- std::vector<int32_t> _offsets;
-};
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_INDEX_H__
diff --git a/libs/util/include/util/tensor/IndexFormatter.h b/libs/util/include/util/tensor/IndexFormatter.h
deleted file mode 100644
index 8014a42b6..000000000
--- a/libs/util/include/util/tensor/IndexFormatter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__
-#define __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__
-
-#include "util/tensor/Index.h"
-
-#include <ostream>
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-class IndexFormatter
-{
-public:
- IndexFormatter(const nnfw::util::tensor::Index &index) : _index(index)
- {
- // DO NOTHING
- }
-
-public:
- const nnfw::util::tensor::Index &index(void) const { return _index; }
-
-private:
- const nnfw::util::tensor::Index &_index;
-};
-
-std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt);
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__
diff --git a/libs/util/include/util/tensor/IndexIterator.h b/libs/util/include/util/tensor/IndexIterator.h
deleted file mode 100644
index 56a8c7dd2..000000000
--- a/libs/util/include/util/tensor/IndexIterator.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__
-#define __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__
-
-#include "util/tensor/Shape.h"
-#include "util/tensor/Index.h"
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-class IndexIterator
-{
-public:
- IndexIterator(const Shape &shape) : _shape(shape)
- {
- // DO NOTHING
- }
-
-public:
- // Allow move, but disallow copy
- IndexIterator(IndexIterator &&) = default;
- IndexIterator(const IndexIterator &) = delete;
-
-public:
- template <typename Callable> IndexIterator &iter(Callable fn)
- {
- Index index(_shape.rank());
-
- for (size_t d = 0; d < _shape.rank(); ++d)
- {
- index.at(d) = 0;
- }
-
- size_t cursor = 0;
-
- while (cursor < _shape.rank())
- {
- fn(index);
-
- if (index.at(cursor) + 1 < _shape.dim(cursor))
- {
- index.at(cursor) += 1;
- }
- else
- {
- while ((cursor < _shape.rank()) && (index.at(cursor) + 1 == _shape.dim(cursor)))
- {
- ++cursor;
- }
-
- if (cursor == _shape.rank())
- {
- break;
- }
-
- index.at(cursor) += 1;
-
- for (size_t d = 0; d < cursor; ++d)
- {
- index.at(d) = 0;
- }
-
- cursor = 0;
- }
- }
-
- return (*this);
- }
-
-private:
- const Shape &_shape;
-};
-
-inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; }
-
-template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb)
-{
- return it.iter(cb);
-}
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__
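The loop above advances the index like an odometer with dimension 0 varying fastest. A usage sketch for a rank-2 shape:

    #include "util/tensor/IndexIterator.h"

    #include <iostream>

    int main()
    {
      using namespace nnfw::util::tensor;

      const Shape shape{2, 3};

      // Visits (0,0), (1,0), (0,1), (1,1), (0,2), (1,2) in that order.
      iterate(shape) << [](const Index &index) {
        std::cout << "(" << index.at(0) << "," << index.at(1) << ")" << std::endl;
      };

      return 0;
    }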
diff --git a/libs/util/include/util/tensor/NonIncreasingStride.h b/libs/util/include/util/tensor/NonIncreasingStride.h
deleted file mode 100644
index ff013ffa2..000000000
--- a/libs/util/include/util/tensor/NonIncreasingStride.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__
-#define __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__
-
-#include "util/tensor/Shape.h"
-#include "util/tensor/Index.h"
-
-#include <vector>
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-// As its name suggests, stride[N-1] >= stride[N] holds for all 0 < N < rank in NonIncreasingStride.
-class NonIncreasingStride
-{
-public:
- void init(const Shape &shape)
- {
- _stride.resize(shape.rank());
- _stride.at(shape.rank() - 1) = 1;
-
- for (uint32_t axis = shape.rank() - 1; axis > 0; --axis)
- {
- _stride.at(axis - 1) = _stride.at(axis) * shape.dim(axis);
- }
- }
-
-public:
- uint32_t at(uint32_t axis) const { return _stride.at(axis); }
-
-public:
- uint32_t offset(const Index &index) const;
-
-private:
- std::vector<uint32_t> _stride;
-};
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__
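A sketch of the resulting row-major strides (offset() is defined out of line in a source file not shown here; the last assertion assumes it computes the usual index/stride inner product):

    #include "util/tensor/NonIncreasingStride.h"

    #include <cassert>

    int main()
    {
      using namespace nnfw::util::tensor;

      NonIncreasingStride stride;
      stride.init(Shape{2, 3, 4});

      // The last axis is contiguous; earlier axes grow by the trailing dims.
      assert(stride.at(2) == 1);
      assert(stride.at(1) == 4);
      assert(stride.at(0) == 12);

      assert(stride.offset(Index{1, 2, 3}) == 1 * 12 + 2 * 4 + 3);
      return 0;
    }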
diff --git a/libs/util/include/util/tensor/Object.h b/libs/util/include/util/tensor/Object.h
deleted file mode 100644
index 839bce236..000000000
--- a/libs/util/include/util/tensor/Object.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_OBJECT_H__
-#define __NNFW_UTIL_TENSOR_OBJECT_H__
-
-#include "util/tensor/Shape.h"
-#include "util/tensor/Index.h"
-#include "util/tensor/IndexIterator.h"
-#include "util/tensor/NonIncreasingStride.h"
-#include "util/tensor/Reader.h"
-
-#include <vector>
-#include <functional>
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-template<typename T> class Object final : public Reader<T>
-{
-public:
- using Generator = std::function<T (const Shape &shape, const Index &index)>;
-
-public:
- Object(const Shape &shape, const Generator &fn) : _shape{shape}
- {
- // Set 'stride'
- _stride.init(shape);
-
- // Pre-allocate buffer
- _values.resize(_shape.dim(0) * _stride.at(0));
-
- // Set 'value'
- iterate(_shape) << [this, &fn] (const Index &index)
- {
- _values.at(_stride.offset(index)) = fn(_shape, index);
- };
- }
-
-public:
- const Shape &shape(void) const { return _shape; }
-
-public:
- T at(const Index &index) const override
- {
- return _values.at(_stride.offset(index));
- }
-
-private:
- Shape _shape;
- NonIncreasingStride _stride;
-
-private:
- std::vector<T> _values;
-};
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_OBJECT_H__
diff --git a/libs/util/include/util/tensor/Reader.h b/libs/util/include/util/tensor/Reader.h
deleted file mode 100644
index 654214880..000000000
--- a/libs/util/include/util/tensor/Reader.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_READER_H__
-#define __NNFW_UTIL_TENSOR_READER_H__
-
-#include "util/tensor/Index.h"
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-template <typename T> struct Reader
-{
- virtual ~Reader() = default;
-
- virtual T at(const Index &index) const = 0;
-};
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_READER_H__
diff --git a/libs/util/include/util/tensor/Shape.h b/libs/util/include/util/tensor/Shape.h
deleted file mode 100644
index d4edeaada..000000000
--- a/libs/util/include/util/tensor/Shape.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_SHAPE_H__
-#define __NNFW_UTIL_TENSOR_SHAPE_H__
-
-#include <cstdint>
-#include <cstddef>
-#include <vector>
-#include <initializer_list>
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-class Shape
-{
-public:
- Shape(size_t rank)
- {
- _dimensions.resize(rank);
- }
-
-public:
- Shape(const std::initializer_list<int32_t> &dimensions) : _dimensions{dimensions}
- {
- // DO NOTHING
- }
-
-public:
- size_t rank(void) const { return _dimensions.size(); }
-
-public:
- int32_t dim(size_t n) const { return _dimensions.at(n); }
- int32_t &dim(size_t n) { return _dimensions.at(n); }
-
-private:
- std::vector<int32_t> _dimensions;
-};
-
-bool operator==(const Shape &, const Shape &);
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_SHAPE_H__
diff --git a/libs/util/include/util/tensor/Zipper.h b/libs/util/include/util/tensor/Zipper.h
deleted file mode 100644
index fc2d94e57..000000000
--- a/libs/util/include/util/tensor/Zipper.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_TENSOR_ZIPPER_H__
-#define __NNFW_UTIL_TENSOR_ZIPPER_H__
-
-#include "util/tensor/Index.h"
-#include "util/tensor/IndexIterator.h"
-#include "util/tensor/Reader.h"
-
-namespace nnfw
-{
-namespace util
-{
-namespace tensor
-{
-
-template <typename T> class Zipper
-{
-public:
- Zipper(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs)
- : _shape{shape}, _lhs{lhs}, _rhs{rhs}
- {
- // DO NOTHING
- }
-
-public:
- template <typename Callable> void zip(Callable cb) const
- {
- iterate(_shape) << [this, &cb] (const Index &index)
- {
- cb(index, _lhs.at(index), _rhs.at(index));
- };
- }
-
-private:
- const Shape &_shape;
- const Reader<T> &_lhs;
- const Reader<T> &_rhs;
-};
-
-template<typename T, typename Callable>
-const Zipper<T> &operator<<(const Zipper<T> &zipper, Callable cb)
-{
- zipper.zip(cb);
- return zipper;
-}
-
-template<typename T>
-Zipper<T> zip(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs)
-{
- return Zipper<T>{shape, lhs, rhs};
-}
-
-} // namespace tensor
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_TENSOR_ZIPPER_H__
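A sketch of element-wise comparison with zip() (the generator is an arbitrary example):

    #include "util/tensor/Object.h"
    #include "util/tensor/Zipper.h"

    #include <iostream>

    int main()
    {
      using namespace nnfw::util::tensor;

      const Shape shape{2, 2};
      const auto gen = [](const Shape &, const Index &index) {
        return static_cast<float>(index.at(0) + index.at(1));
      };

      const Object<float> lhs{shape, gen};
      const Object<float> rhs{shape, gen};

      // zip() pairs the two readers up under a single index walk.
      zip(shape, lhs, rhs) << [](const Index &index, float a, float b) {
        std::cout << index.at(0) << "," << index.at(1) << ": " << a - b << std::endl;
      };

      return 0;
    }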
diff --git a/libs/util/include/util/vector.h b/libs/util/include/util/vector.h
deleted file mode 100644
index 49a58a41e..000000000
--- a/libs/util/include/util/vector.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_VECTOR_H__
-#define __NNFW_UTIL_VECTOR_H__
-
-#include <vector>
-
-template <typename T>
-bool operator==(const std::vector<T> &lhs, const std::vector<T> &rhs)
-{
- if (lhs.size() != rhs.size())
- {
- return false;
- }
-
- for (size_t ind = 0; ind < lhs.size(); ++ind)
- {
- if (lhs.at(ind) != rhs.at(ind))
- {
- return false;
- }
- }
-
- return true;
-}
-
-#endif // __NNFW_UTIL_VECTOR_H__
diff --git a/libs/util/include/util/vector/Object.h b/libs/util/include/util/vector/Object.h
deleted file mode 100644
index b1bc521da..000000000
--- a/libs/util/include/util/vector/Object.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_VECTOR_OBJECT_H__
-#define __NNFW_UTIL_VECTOR_OBJECT_H__
-
-#include "util/vector/Reader.h"
-
-#include <vector>
-#include <functional>
-
-namespace nnfw
-{
-namespace util
-{
-namespace vector
-{
-
-template<typename T> class Object final : public Reader<T>
-{
-public:
- using Generator = std::function<T (int32_t size, int32_t offset)>;
-
-public:
- Object(int32_t size, const Generator &gen) : _size{size}
- {
- _value.resize(_size);
-
- for (int32_t offset = 0; offset < size; ++offset)
- {
- _value.at(offset) = gen(size, offset);
- }
- }
-
-public:
- int32_t size(void) const { return _size; }
-
-public:
- T at(uint32_t nth) const override { return _value.at(nth); }
-
-private:
- const int32_t _size;
- std::vector<T> _value;
-};
-
-} // namespace vector
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_VECTOR_OBJECT_H__
diff --git a/libs/util/include/util/vector/Reader.h b/libs/util/include/util/vector/Reader.h
deleted file mode 100644
index a3c5cb359..000000000
--- a/libs/util/include/util/vector/Reader.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NNFW_UTIL_VECTOR_READER_H__
-#define __NNFW_UTIL_VECTOR_READER_H__
-
-#include <cstdint>
-
-namespace nnfw
-{
-namespace util
-{
-namespace vector
-{
-
-template <typename T> struct Reader
-{
- virtual ~Reader() = default;
-
- virtual T at(uint32_t nth) const = 0;
-};
-
-} // namespace vector
-} // namespace util
-} // namespace nnfw
-
-#endif // __NNFW_UTIL_VECTOR_READER_H__
diff --git a/libs/util/src/environment.cpp b/libs/util/src/environment.cpp
index dca6c5c55..4b18b409f 100644
--- a/libs/util/src/environment.cpp
+++ b/libs/util/src/environment.cpp
@@ -25,25 +25,23 @@ namespace nnfw
namespace util
{
-int get_env_int(const char* name)
+int get_env_int(const char *name, int defaultValue)
{
const char *value = std::getenv(name);
if (value != nullptr)
return std::stoi(value);
- return 0;
+ return defaultValue;
}
-bool get_env_bool(const char* name)
+bool get_env_bool(const char *name, bool defaultValue)
{
const char *value = std::getenv(name);
if (value != nullptr)
{
- if (std::stoi(value))
- return true;
- if (!strcasecmp(value, "true"))
- return true;
+ return std::stoi(value) != 0;
}
- return false;
+
+ return defaultValue;
}
} // namespace util
@@ -74,6 +72,24 @@ bool IntAccessor::access(int &out) const
return true;
}
+FloatAccessor::FloatAccessor(const std::string &tag) : _tag{tag}
+{
+ // DO NOTHING
+}
+
+bool FloatAccessor::access(float &out) const
+{
+ auto value = std::getenv(_tag.c_str());
+
+ if (value == nullptr)
+ {
+ return false;
+ }
+
+ out = std::stof(value);
+ return true;
+}
+
} // namespace env
} // namespace util
} // namespace nnfw
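With the new signatures, callers state the fallback explicitly instead of relying on the old implicit 0/false. A sketch (assuming the matching declarations in util/environment.h; "NUM_THREADS", "VERBOSE" and "SCALE" are made-up variable names):

    #include "util/environment.h"

    #include <iostream>

    int main()
    {
      const int threads = nnfw::util::get_env_int("NUM_THREADS", 4);
      const bool verbose = nnfw::util::get_env_bool("VERBOSE", false);

      // The new FloatAccessor follows the same pattern as IntAccessor:
      // access() returns false and leaves 'scale' untouched when unset.
      float scale = 1.0f;
      nnfw::util::env::FloatAccessor{"SCALE"}.access(scale);

      std::cout << threads << " " << verbose << " " << scale << std::endl;
      return 0;
    }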
diff --git a/libs/util/src/profiling/time.cc b/libs/util/src/profiling/time.cc
new file mode 100644
index 000000000..6fe1b54dc
--- /dev/null
+++ b/libs/util/src/profiling/time.cc
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "util/profiling/time.h"
+
+#include <sys/time.h>
+
+namespace tflite
+{
+namespace profiling
+{
+namespace time
+{
+uint64_t NowMicros()
+{
+ struct timeval tv;
+ gettimeofday(&tv, nullptr);
+ return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+} // namespace time
+} // namespace profiling
+} // namespace tflite
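A minimal timing sketch around NowMicros():

    #include "util/profiling/time.h"

    #include <cstdint>
    #include <iostream>

    int main()
    {
      const uint64_t begin = tflite::profiling::time::NowMicros();

      // ... workload under measurement ...

      const uint64_t end = tflite::profiling::time::NowMicros();
      std::cout << "elapsed: " << (end - begin) << " us" << std::endl;
      return 0;
    }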
diff --git a/libs/util/src/tensor/Comparator.cpp b/libs/util/src/tensor/Comparator.cpp
new file mode 100644
index 000000000..89cd687e9
--- /dev/null
+++ b/libs/util/src/tensor/Comparator.cpp
@@ -0,0 +1,40 @@
+#include "util/tensor/Comparator.h"
+#include "util/tensor/Zipper.h"
+
+namespace nnfw
+{
+namespace util
+{
+namespace tensor
+{
+
+std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<float> &expected,
+ const Reader<float> &obtained,
+ Observer *observer) const
+{
+ std::vector<Diff<float>> res;
+
+ zip(shape, expected, obtained) <<
+    [&](const Index &index, float expected_value, float obtained_value) {
+      if (!_compare_fn(expected_value, obtained_value))
+      {
+        res.emplace_back(index, expected_value, obtained_value);
+      }
+
+      // Give the observer a chance to see every element
+      // (e.g. to track the index of the maximum diff)
+      if (observer != nullptr)
+      {
+        observer->notify(index, expected_value, obtained_value);
+      }
+    };
+
+ return res;
+}
+
+} // namespace tensor
+} // namespace util
+} // namespace nnfw
diff --git a/libs/util/src/tensor/Shape.cpp b/libs/util/src/tensor/Shape.cpp
index d177d1382..f1de26fdc 100644
--- a/libs/util/src/tensor/Shape.cpp
+++ b/libs/util/src/tensor/Shape.cpp
@@ -16,6 +16,8 @@
#include "util/tensor/Shape.h"
+#include <cassert>
+
namespace nnfw
{
namespace util
@@ -32,7 +34,7 @@ bool operator==(const Shape &lhs, const Shape &rhs)
for (size_t axis = 0; axis < lhs.rank(); ++axis)
{
- if(lhs.dim(axis) != rhs.dim(axis))
+ if (lhs.dim(axis) != rhs.dim(axis))
{
return false;
}
@@ -41,6 +43,57 @@ bool operator==(const Shape &lhs, const Shape &rhs)
return true;
}
+Shape Shape::from(const std::string &str)
+{
+ Shape shape(0);
+
+ bool pending = false;
+ int value = 0;
+
+ for (const char *cur = str.c_str(); true; ++cur)
+ {
+ if (*cur == ',' || *cur == '\0')
+ {
+ if (pending)
+ {
+ shape.append(value);
+ }
+
+ if (*cur == '\0')
+ {
+ break;
+ }
+
+ pending = false;
+ value = 0;
+ continue;
+ }
+
+ assert(*cur >= '0' && *cur <= '9');
+
+ pending = true;
+ value *= 10;
+ value += *cur - '0';
+ }
+
+ return shape;
+}
+
+std::ostream &operator<<(std::ostream &os, const Shape &shape)
+{
+ if (shape.rank() > 0)
+ {
+ os << shape.dim(0);
+
+ for (uint32_t axis = 1; axis < shape.rank(); ++axis)
+ {
+ os << "," << shape.dim(axis);
+ }
+ }
+
+ return os;
+}
+
} // namespace tensor
} // namespace util
} // namespace nnfw
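A round-trip sketch for the new parser and printer (assuming Shape::from, Shape::append and this operator<< are declared in util/tensor/Shape.h, which is not part of this diff):

    #include "util/tensor/Shape.h"

    #include <iostream>

    int main()
    {
      // Parse a comma-separated dimension list, then print it back out.
      const auto shape = nnfw::util::tensor::Shape::from("1,3,224,224");

      std::cout << shape << std::endl; // prints "1,3,224,224"
      return 0;
    }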