diff options
Diffstat (limited to 'compute/ARMComputeEx')
186 files changed, 23807 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt new file mode 100644 index 000000000..aaebff758 --- /dev/null +++ b/compute/ARMComputeEx/CMakeLists.txt @@ -0,0 +1,32 @@ +nnas_find_package(ARMCompute QUIET) + +if(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: need ARM Compute library") + return() +else(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: OK") +endif(NOT ARMCompute_FOUND) + +set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR}) + +file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") + +# generate embeded cl_kernel +execute_process ( + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMAND bash -c "python resolve_includes.py" +) + +add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) +target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE}) +target_link_libraries(arm_compute_ex PRIVATE arm_compute) +target_link_libraries(arm_compute_ex PRIVATE nnfw_common) +target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage) +# Defines to enable validate check in debug build +target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS + $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED + ARM_COMPUTE_LOGGING_ENABLED>) +# Validate check functions are not used on release build +# Some parameter are used for validate check function call, and these parameter may not used on release build +target_compile_options(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:-Wno-unused-parameter -Wno-unused-function>) +install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h new file mode 100644 index 000000000..e4e752ef9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + +#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ +#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <map> +#include <set> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ +class CLKernelLibraryEx +{ + using StringSet = std::set<std::string>; + +private: + /** + * @brief Construct a new CLKernelLibraryEx object + */ + CLKernelLibraryEx(); + +public: + /** + * @brief Prevent instances of this class from being copied. + */ + CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; + + /** + * @brief Prevent instances of this class from being copied. + */ + const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance + */ + static CLKernelLibraryEx &get(); + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. 
+ * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) + { + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); + } + + /** + * @brief Set the path that the kernels reside in. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A + */ + void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files + */ + std::string get_kernel_path() { return _kernel_path; }; + + /** + * @brief Get the source of the selected program. + * @param[in] program_name Program name. + * @return Source of the selected program. + */ + std::string get_program_source(const std::string &program_name); + + /** + * @brief Set the CL context used to create programs. + * @note Setting the context also resets the device to the + * first one available in the new context. + * @param[in] context A CL context. + * @return N/A + */ + void set_context(cl::Context context) + { + _context = std::move(context); + if (_context.get() == nullptr) + { + _device = cl::Device(); + } + else + { + const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); + + if (cl_devices.empty()) + { + _device = cl::Device(); + } + else + { + _device = cl_devices[0]; + } + } + } + + /** + * @brief Return associated CL context. + * @return A CL context. + */ + cl::Context &context() { return _context; } + + /** + * @brief Set the CL device for which the programs are created. + * @param[in] device A CL device. + * @return N/A + */ + void set_device(cl::Device device) { _device = std::move(device); } + + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. 
+ */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version + * @return The content of CL_DEVICE_VERSION + */ + std::string get_device_version(); + + /** + * @brief Create a kernel from the kernel library. + * @param[in] kernel_name Kernel name. + * @param[in] build_options_set Kernel build options as a set. + * @return The created kernel. + */ + Kernel create_kernel(const std::string &kernel_name, + const StringSet &build_options_set = {}) const; + + /** + * @brief Find the maximum number of local work items in a workgroup can be supported for the + * kernel. + * @param[in] kernel kernel object + */ + + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** + * @brief Return the default NDRange for the device. + * @return default NDRangeof the device + */ + cl::NDRange default_ndrange() const; + + /** + * @brief Clear the library's cache of binary programs + * @return N/A + */ + void clear_programs_cache() + { + _programs_map.clear(); + _built_programs_map.clear(); + } + + /** + * @brief Access the cache of built OpenCL programs + * @return program map data structure of which key is name of kernel and value is + * kerel source name. 
(*.cl) + */ + const std::map<std::string, cl::Program> &get_built_programs() const + { + return _built_programs_map; + } + + /** + * @brief Add a new built program to the cache + * @param[in] built_program_name Name of the program + * @param[in] program Built program to add to the cache + * @return N/A + */ + void add_built_program(const std::string &built_program_name, cl::Program program); + + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + +private: + /** + * @brief Load program and its dependencies. + * @param[in] program_name Name of the program to load. + */ + const Program &load_program(const std::string &program_name) const; + /** + * @brief Concatenates contents of a set into a single string. + * @param[in] s Input set to concatenate. + * @return Concatenated string. + */ + std::string stringify_set(const StringSet &s) const; + + cl::Context _context; /**< Underlying CL context. */ + cl::Device _device; /**< Underlying CL device. */ + std::string _kernel_path; /**< Path to the kernels folder. */ + mutable std::map<std::string, const Program> + _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, cl::Program> + _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. 
>*/ +}; +} +#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h new file mode 100644 index 000000000..b98b174f7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLArgOperationKernel.h + * @brief This file defines CLArgOperationKernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the argop kernel. + */ +class CLArgOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor. + */ + CLArgOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + */ + CLArgOperationKernel(const CLArgOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + */ + CLArgOperationKernel(CLArgOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The output tensor, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLArgOperationKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] output The output tensor info, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ArgOperation op); + + /* + * @brief Run CLArgOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..ab33d9d3a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h new file mode 100644 index 000000000..16cef0b61 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLCastKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +#define __ARM_COMPUTE_CLCASTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define OpenCL kernel for cast operation + */ +class CLCastKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLCastKernel object + */ + CLCastKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel(const CLCastKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel &operator=(const CLCastKernel &) = delete; + + /** + * @brief Construct CLCastKernel object using default move constructor + * @param[in] CLCastKernel object to move + */ + CLCastKernel(CLCastKernel &&) = default; + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLCastKernel object to move + */ + CLCastKernel &operator=(CLCastKernel &&) = default; + + /** + * @brief Destruct this CLCastKernel object + */ + ~CLCastKernel() = default; + + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h new file mode 100644 index 000000000..60ec7a82a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform depthTospace operation */ +class CLDepthToSpaceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDepthToSpaceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; + /** Default destructor */ + ~CLDepthToSpaceKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..da075db69 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. 
+ * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h new file mode 100644 index 000000000..aa81a1efa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLGatherExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherExKernel class + */ + +#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ +#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define an interface for the gather kernel. + */ +class CLGatherExKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLGatherExKernel object + * */ + CLGatherExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel(const CLGatherExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel &operator=(const CLGatherExKernel &) = delete; + + /** + * @brief Construct CLGatherExKernel object by using default move constructor + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel(CLGatherExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel &operator=(CLGatherExKernel &&) = default; + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. 
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLGatherExKernel + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_indices; + ICLTensor *_output; + int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..8269e5a7a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with opencl kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+  /**
+   * @brief Construct a CLHashtableLookupKernel object
+   * */
+  CLHashtableLookupKernel();
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * */
+  CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * */
+  CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+  /**
+   * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+   * @param[in] CLHashtableLookupKernel object to move
+   * */
+  CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+  /**
+   * @brief Move assignment operator
+   * @param[in] CLHashtableLookupKernel object to move
+   * */
+  CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+  /**
+   * @brief Destruct this object
+   * */
+  ~CLHashtableLookupKernel() = default;
+
+  /**
+   * @brief Set the input and output of the kernel
+   * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+   *                     @p input.
+   * @param[in]  keys    Keys 1D tensor. keys and input pair represent a map.
+   *                     Data types supported: S32
+   * @param[in]  input   Source tensor.
+   *                     Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p
+   *                     input.
+   * @param[out] hits    Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+   *                     (True) or not (False). Data types supported: U8/QASYMM8
+   * @return N/A
+   */
+  void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+                 ICLTensor *output, ICLTensor *hits);
+
+  /**
+   * @brief Static function to check if given info will lead to a valid configuration of @ref
+   *        CLHashtableLookupKernel
+   * @note All parameters are inputs here: validate() only inspects the tensor infos, it does
+   *       not modify them (every argument is a pointer-to-const).
+   * @param[in] lookups The lookups tensor info. Data types supported: S32.
+   * @param[in] keys    The keys tensor info. keys and input pair represent a map.
+   *                    Data types supported: S32
+   * @param[in] input   The input tensor info.
+   *                    Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] output  The output tensor info. Data types and data layouts supported: Same as @p
+   *                    input.
+   * @param[in] hits    The hits tensor info. A boolean tensor that indicates whether the lookup
+   *                    hits (True) or not (False). Data types supported: U8/QASYMM8
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                         const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *hits);
+
+  /**
+   * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+   *        queue.
+   * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+   *       been executed by the time this method returns.
+   * @param[in]     window Region on which to execute the kernel. (Must be a valid region of
+   *                       the window returned by window()).
+   * @param[in,out] queue  Command queue on which to enqueue the kernel.
+   * @return N/A
+   */
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_lookups{nullptr};                 /**< Lookups tensor */
+  const ICLTensor *_keys{nullptr};                    /**< Keys tensor */
+  const ICLTensor *_input{nullptr};                   /**< Source tensor */
+  ICLTensor *_output{nullptr};                        /**< Destination tensor */
+  ICLTensor *_hits{nullptr};                          /**< Hits tensor */
+  std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..f5e147e03
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernelEx : public ICLKernel
+{
+public:
+  /** Constructor */
+  CLInstanceNormalizationLayerKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Default Move Constructor. */
+  CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default move assignment operator */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default destructor */
+  ~CLInstanceNormalizationLayerKernelEx() = default;
+
+  /** Set the input and output tensors.
+   *
+   * @note NOTE(review): this doc lists NCHW only while validate() below lists "NHWC, NCHW" --
+   *       confirm the actually supported layouts against the kernel implementation.
+   *
+   * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported:
+   *                         NCHW
+   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
+   *                         input.
+   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults
+   *                         to nullptr
+   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor. Defaults
+   *                         to nullptr
+   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   */
+  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   *  CLInstanceNormalizationLayerEx.
+   *
+   * @param[in] input   Source tensor info. In case of @p output tensor = nullptr this tensor will
+   *                    store the result of the normalization.
+   *                    Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
+   *                    input.
+   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+                         float epsilon = 1e-12f);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  ICLTensor *_input;
+  ICLTensor *_output;
+  ICLTensor *_gamma;
+  ICLTensor *_beta;
+  float _epsilon;
+  bool _run_in_place; /**< True when the result is written back into @p input (output omitted) --
+                           presumably set in configure(); confirm in the kernel source */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on tensor*/
+class CLNegKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLNegKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLNegKernel(const CLNegKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLNegKernel &operator=(const CLNegKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLNegKernel(CLNegKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLNegKernel &operator=(CLNegKernel &&) = default;
+  /** Initialize the kernel's input, output.
+   *
+   * @param[in]  input  Source tensor (supported data types are not stated here -- see the kernel
+   *                    source for the accepted types).
+   * @param[out] output Destination tensor.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
new file mode 100644
index 000000000..eff1b8bd5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to calculate PReLU*/
+class CLPReLUKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLPReLUKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLPReLUKernel(const CLPReLUKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLPReLUKernel(CLPReLUKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+  /** Initialize the kernel's input, output.
+   *
+   * @param[in]  input  Source tensor1: the input feature map.
+   * @param[in]  alpha  Source tensor2: the alpha (negative-slope) values, as named by the PReLU
+   *                    definition.
+   * @param[out] output Output tensor.
+   */
+  void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+  /** Border size used by this kernel. A non-zero border suggests the kernel accesses elements
+   *  outside the valid region (e.g. for broadcasting @p alpha) -- confirm in the kernel source. */
+  BorderSize border_size() const override;
+
+private:
+  const ICLTensor *_input;
+  const ICLTensor *_alpha;
+  ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..a26a4a7fc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+  /**
+   * @brief Default constructor
+   */
+  CLReduceOperationKernel();
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+  /**
+   * @brief Allow instances of this class to be moved
+   */
+  CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+  /**
+   * @brief Allow instances of this class to be moved
+   */
+  CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+  /**
+   * @brief Default destructor
+   */
+  ~CLReduceOperationKernel() = default;
+
+  /**
+   * @brief Set the input and output tensors.
+   * @param[in]  input  Source tensor. Data types supported: U8/S32/F32.
+   * @param[out] output Destination tensor. Data types supported: Same as @p input.
+   *                    Output will have the same number of dimensions as input.
+   * @param[in]  axis   Axis along which to reduce.
+   * @param[in]  op     Reduce operation to perform.
+   * @return N/A
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+                 ReduceOperation op);
+
+  /**
+   * @brief Static function to check if given info will lead to a valid configuration of @ref
+   *        CLReduceOperationKernel.
+   * @param[in] input  Source tensor info. Data types supported: U8/S32/F32.
+   * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+   *                   Output will have the same number of dimensions as input.
+   * @param[in] axis   Axis along which to reduce.
+   * @param[in] op     Reduce operation to perform.
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                         ReduceOperation op);
+
+  /**
+   * @brief Run CLReduceOperationKernel op
+   * @param[in] window Window to be used for in_slice
+   * @param[in] queue  CLQueue
+   * @return N/A
+   */
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  ICLTensor *_output;
+  uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644
index 000000000..577e38cc4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation (class is final: not intended to be
+ *  derived from) */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToBatchNDKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToBatchNDKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @note The data layout of input and output must be the same.
+   * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *       are height and width.
+   * @param[in]  input        Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                          Data layout supported: NCHW/NHWC
+   * @param[in]  block_size   Block size tensor. Data types supported: S32.
+   * @param[in]  padding_size Padding size tensor. Data types supported: S32.
+   * @param[out] output       Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                          Data layout supported: NCHW/NHWC
+   */
+  void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+                 ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input{nullptr};        /**< Source tensor */
+  const ICLTensor *_block_size{nullptr};   /**< Block size tensor */
+  const ICLTensor *_padding_size{nullptr}; /**< Padding size tensor */
+  ICLTensor *_output{nullptr};             /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 000000000..be845a549
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform spaceTodepth operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToDepthKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToDepthKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input      Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size -- presumably each block_size x block_size block of
+   *                        spatial values is rearranged into the depth dimension (standard
+   *                        SPACE_TO_DEPTH semantics); confirm against the kernel source.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input; /**< Source tensor */
+  ICLTensor *_output;      /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
new file mode 100644
index 000000000..8da2daecc
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd.
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +// these parameters can be changed +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define CLTopKV2Single + */ +class CLTopKV2Single : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Single(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +/** + * @brief Class to define CLTopKV2Init + */ +class CLTopKV2Init : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Init(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ + CLTopKV2Init(CLTopKV2Init &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +/** + * @brief Class to define CLRadixSortHistogram + */ +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +/** + * @brief Class to define CLRadixSortScanHistogram + */ +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortGlobalScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortPasteHistogram + */ +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortPasteHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortReorder + */ +class CLRadixSortReorder : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortReorder(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer 
*_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2FindFirstNegative(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, 
cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2ReorderNegatives(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf = 
out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2Store + */ +class CLTopKV2Store : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Store(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A 
+ */ + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute +#endif // Disable GPU implementation +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h new file mode 100644 index 000000000..c5ef730b6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. 
+ */ +class CLTransposeConvLayerUpsampleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLTransposeConvLayerUpsampleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel & + operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Default Move Constructor. */ + CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default move assignment operator */ + CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default destructor */ + ~CLTransposeConvLayerUpsampleKernel() = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. All but + * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only + * performed within the XY-plane. + * @param[in] inner_border Top and right inner border sizes. These rows and columns will be + * filled with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + */ + void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayerUpsample + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types supported: same as @p input. All + * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is + * only performed within the XY-plane. 
+ * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled + * with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + BorderSize _inner_border; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h new file mode 100644 index 000000000..d093c22cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** CPP kernel to perform tensor upsample. + * + */ +class CPPUpsampleKernelEx : public ICPPKernel +{ +public: + const char *name() const override { return "CPPUpsampleKernelEx"; } + /** Default constructor */ + CPPUpsampleKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; + /** Default destructor */ + ~CPPUpsampleKernelEx() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] info Padding info. 
+ */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; + +private: + const ITensor *_input; + ITensor *_output; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h new file mode 100644 index 000000000..358e0ebc6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ +#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +class ITensor; +class Window; +class QuantizationInfo; +} // namespace arm_compute + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale); + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale); + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale); + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); +} // namespace arm_compute +#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h 
b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h new file mode 100644 index 000000000..61992bd50 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ + +class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +{ +public: + /** Default destructor */ + ~NEBinaryLogicalOperationKernel() = default; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[in] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
+ * + * @return a Status + */ + static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1, + const ITensorInfo *input2, const ITensorInfo *output); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + const ITensorInfo &output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h new file mode 100644 index 000000000..fd2a2ee3b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ +#define __ARM_COMPUTE_NECASTKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the cast layer kernel. */ +class NECastKernel : public INEKernel +{ +public: + const char *name() const override { return "NECastKernel"; } + /** Default constructor */ + NECastKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel(const NECastKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel &operator=(const NECastKernel &) = delete; + /** Default Move Constructor. */ + NECastKernel(NECastKernel &&) = default; + /** Default move assignment operator */ + NECastKernel &operator=(NECastKernel &&) = default; + /** Default destructor */ + ~NECastKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); + /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + SubDataType _input_subtype; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h new file mode 100644 index 000000000..5b6ef6bfb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depth to space kernel */ +class NEDepthToSpaceLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } + /** Default constructor */ + NEDepthToSpaceLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; + /** Default destructor */ + ~NEDepthToSpaceLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerKernelEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape value. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h new file mode 100644 index 000000000..d6fad1155 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for an element-wise unary operation kernel + * + * Element-wise operation is computed by: + * @f[ output(x) = OP(input(x))@f] + * + */ +class NEElementwiseUnaryKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEElementwiseUnaryKernelEx"; } + /** Default constructor */ + NEElementwiseUnaryKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; + /** Default destructor */ + ~NEElementwiseUnaryKernelEx() = default; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEElementwiseUnaryKernelEx + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] input First tensor input. Data types supported: F16/F32/S32. + * @param[in] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEElementwiseUnaryKernelEx + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] input First tensor input info. Data types supported: F16/F32/S32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * + * @return a Status + */ + static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + + /** Common signature for all the specialised arithmetic functions + * + * @param[in] input An input tensor. Data types supported: F16/F32/S32. + * @param[out] output The output tensor. Data types supported: Same as @p input. + * @param[in] window Region on which to execute the kernel. + */ + using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, + const Window &window); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); + + /** Function to use for the particular tensor types passed to configure() */ + std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function; + + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h new file mode 100644 index 000000000..1490e75f2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform EmbeddingLookup operation */ +class NEEmbeddingLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEEmbeddingLookupKernel"; } + /** Default constructor */ + NEEmbeddingLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups are 1D tensor that values are indices into the first dimension of + * input. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookupKernel + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + const ITensor *_lookups; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h new file mode 100644 index 000000000..3fa9c6e9a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ +#define __ARM_COMPUTE_NEGATHERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform other operation on NEON */ +class NEGatherKernelEx : public INEKernel +{ +public: + /** Default constructor. */ + NEGatherKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx(const NEGatherKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete; + /** Allow instances of this class to be moved. */ + NEGatherKernelEx(NEGatherKernelEx &&) = default; + /** Allow instances of this class to be moved. 
*/ + NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default; + /** Default detructor */ + ~NEGatherKernelEx() = default; + + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEGatherKernelEx"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the gather operation for 0 axis. + * + * For gather on the 0 axis an element by element copy is performed. 
+ * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info); + + /** Implementation of the gather operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info); + + using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info); + + const ITensor *_input; + const ITensor *_indices; + int _axis; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h new file mode 100644 index 000000000..d8976e7d0 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HashtableLookup operation */ +class NEHashtableLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEHashtableLookupKernel"; } + /** Default constructor */ + NEHashtableLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default; + /** Initialize the kernel's inputs, outputs. + * + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * input. + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookupKernel + * + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). 
Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_lookups; /** Lookups tensor */ + const ITensor *_keys; /** Keys tensor */ + const ITensor *_input; /** Source tensor */ + ITensor *_output; /** Destination tensor */ + ITensor *_hits; /** Hits tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..76e2587af --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for performing an instance normalization */ +class NEInstanceNormalizationLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; } + /** Default constructor */ + NEInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx & + operator=(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx & + operator=(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~NEInstanceNormalizationLayerKernelEx() = default; + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * In case of @p output tensor = nullptr this tensor will store the result + * of the normalization. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. 
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Common signature for all the specialized instance normalization functions + * + * @param[in, out] input An input tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * @param[out] output The output tensor. + * @param[in] gamma The scale scalar value applied to the normalized tensor. Defaults to + * 1.0 + * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to + * 0.0 + * @param[in] epsilon Lower bound value for the normalization. 
Defaults to 1e-12 + */ + using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window); + + NormalizationFunction *_func; + ITensor *_input; + ITensor *_output; + ITensor *_gamma; + ITensor *_beta; + float _epsilon; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h new file mode 100644 index 000000000..723b14523 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. */ +class NEMultiplyScaleFactorKernel : public INEKernel +{ +public: + const char *name() const override { return "NEMultiplyScaleFactorKernel"; } + /** Default constructor */ + NEMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor. */ + NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~NEMultiplyScaleFactorKernel() = default; + /** Set input, output tensors. + * + * @param[in/out] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + */ + void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier = 1.f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void multiply(const Window &window); + +private: + const ITensor *_input; + const ITensor *_scale_factor; + ITensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h new file mode 100644 index 000000000..79bb78661 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ +#define __ARM_COMPUTE_NEPRELUKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform Parametric Rectified Linear Unit + * + * Result is computed by: + * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] + */ +class NEPReLUKernel : public INEKernel +{ +public: + const char *name() const override { return "NEPReLUKernel"; } + /** Default constructor */ + NEPReLUKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPReLUKernel(const NEPReLUKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; + /** Allow instances of this class to be moved */ + NEPReLUKernel(NEPReLUKernel &&) = default; + /** Allow instances of this class to be moved */ + NEPReLUKernel &operator=(NEPReLUKernel &&) = default; + /** Initialise the kernel's inputs and output + * + * @param[in] input Input tensor. Data type supported: QASYMM8/F32 + * @param[in] alpha Alpha tensor. Data types supported: Same as @p input + * @param[out] output Output tensor. Data types supported: Same as @p input + */ + void configure(const ITensor *input, const ITensor *alpha, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEPReLUKernel.h + * + * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. + * @param[in] alpha Alpha tensor input info. 
Data types supported: Same as @p input. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * + * @return a Status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, + const ITensorInfo *output); + static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output); + +private: + const ITensor *_input; /**< Source tensor */ + const ITensor *_alpha; /**< Alpha tensor */ + ITensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h new file mode 100644 index 000000000..590b23873 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the dequantization layer kernel. */ +class NEQuantizationSymmetricKernel : public INEKernel +{ +public: + const char *name() const override { return "NEQuantizationSymmetricKernel"; } + /** Default constructor */ + NEQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor. */ + NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~NEQuantizationSymmetricKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output, ITensor *scale_factor); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. 
Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void quantize(const Window &window); + +private: + const ITensor *_input; + ITensor *_output; + ITensor *_scale_factor; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h new file mode 100644 index 000000000..73991b67d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a reduction operation */ +class NEReductionOperationKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEReductionOperationKernelEx"; } + /** Default constructor */ + NEReductionOperationKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default; + /** Default destructor */ + ~NEReductionOperationKernelEx() = default; + + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: + * NCHW. + * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input. 
+ * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + */ + void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationKernelEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts + * supported: NCHW. + * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p + * input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + const ITensor *_input; + ITensor *_output; + unsigned int _reduction_axis; + ReduceOperation _op; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h new file mode 100644 index 000000000..5d697c2b2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the space to depth kernel */ +class NESpaceToDepthLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } + /** Default constructor */ + NESpaceToDepthLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; + /** Default destructor */ + ~NESpaceToDepthLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerKernelEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. 
Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..3b0902f08 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available reduce operations */ +enum class ReduceOperation +{ + MAX, /**< Max */ + MEAN, /**< Mean */ + SUM, /**< Sum */ + MIN, /**< Min */ +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperationEx +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +enum class ElementWiseUnaryEx +{ + NEG, /**< NEG */ +}; + +enum class SubDataType +{ + NONE, + BOOL, +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..39026e6bb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include <utility> + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ + +/** Returns expected width and height of the transpose convolution's output tensor. 
+ * + * @note This function was copied in order to fix a bug computing to wrong output dimensions. + * + * @param[in] in_width Width of input tensor (Number of columns) + * @param[in] in_height Height of input tensor (Number of rows) + * @param[in] kernel_width Kernel width. + * @param[in] kernel_height Kernel height. + * @param[in] info padding and stride info. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_top The number of zeros added to bottom edge of the output. + * + * @return A pair with the new width in the first position and the new height in the second. + */ +const std::pair<unsigned int, unsigned int> +transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_top); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h new file mode 100644 index 000000000..16fd40ed9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ +#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Utils.h" + +#include "arm_compute/core/utils/helpers/tensor_transform.h" + +#include <cmath> + +namespace arm_compute +{ +namespace misc +{ +namespace shape_calculator +{ + +/** Calculate the upsampled output shape used for transpose convolution + * + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * @param[in] info Padding and stride info + * @param[in] out_dims Output shape dimensions + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[out] pad_left Padding on left + * @param[out] pad_right Padding on right + * @param[out] pad_top Padding on top + * @param[out] pad_bottom Padding on bottom + * + * @return the calculated shape + */ +inline TensorShape compute_transposeconv_upsampled_shape( + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) +{ + unsigned int sx = info.stride().first; + unsigned int sy = info.stride().second; + const DataLayout data_layout = input.data_layout(); + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Find the upsampled dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + // upsample out: + // upsample_out = 1 + (in - 1) * stride + unsigned int out_x = (input.dimension(idx_w) - 1) 
* sx + 1; + unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1; + + // Find the padding needed for the convolution with stride 1 in order to match output shape + // upsample+pad out: + // upsample_out + pad = tconv_out + kernel - 1 + // pad = tconv_out + kernel - 1 - upsample_out + unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); + unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); + out_x += padx; + out_y += pady; + + unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; + unsigned int pady_all_except_invallid = + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); + pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; + pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); + pad_bottom = pady_all_except_invallid / 2 - info.pad_bottom() + invalid_bottom; + + TensorShape scale_out_shape(input.tensor_shape()); + scale_out_shape.set(idx_w, out_x); + scale_out_shape.set(idx_h, out_y); + + return scale_out_shape; +} + +/** Calculate the output shape of the transpose convolution layer + * + * @param[in] out_dims Output x and y shape dimensions + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * + * @return the calculated shape + */ +inline TensorShape +compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, + const ITensorInfo &input, const ITensorInfo &weights) +{ + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; + + const DataLayout data_layout = input.data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = + 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + TensorShape out_shape{input_shape}; + out_shape.set(width_idx, out_dims.first); + out_shape.set(height_idx, out_dims.second); + out_shape.set(channel_idx, weights_shape[batch_idx]); + return out_shape; +} + +/** Calculate the depth to space output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block) +{ + ARM_COMPUTE_ERROR_ON(block < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(idx_width, input->dimension(idx_width) * block); + output_shape.set(idx_height, input->dimension(idx_height) * block); + output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block)); + + return output_shape; +} + +/** Calculate the space to batch output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block_shape Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON(block_shape < 2); + TensorShape output_shape{input->tensor_shape()}; + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + 
const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape); + output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape); + output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape)); + + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape, + const TensorShape &indices_shape, uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); + ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); + + TensorShape output_shape = input_shape; + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } + return output_shape; +} + +} // namespace shape_calculator +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h new file mode 100644 index 000000000..831bb5423 --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ +#define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +#include <arm_compute/runtime/CL/functions/CLArgOperation.h> +#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h> +#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h> +#include <arm_compute/runtime/CL/functions/CLCast.h> +#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h> +#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> +#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h> +#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLLogicalNot.h> +#include <arm_compute/runtime/CL/functions/CLNeg.h> +#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h> +#include <arm_compute/runtime/CL/functions/CLPReLU.h> +#include <arm_compute/runtime/CL/functions/CLReduceOperation.h> +#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToBatchND.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h> +#include 
<arm_compute/runtime/CL/functions/CLSplit.h> +#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h> +#include <arm_compute/runtime/CL/functions/CLTopKV2.h> +#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h> + +#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h new file mode 100644 index 000000000..d9d0d4d35 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLArgOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLArgOperation class + */ + +#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ +#define __ARM_COMPUTE_CLARGOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute CLArgOperation operation + */ +class CLArgOperation : public IFunction +{ +public: + /** + * @brief Construct a new CLArgOperation object + */ + CLArgOperation(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgOperation(const CLArgOperation &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLArgOperation &operator=(const CLArgOperation &) = delete; + + /** + * @brief Construct a new CLArgOperation object by using copy constructor + * @param[in] CLArgOperation object to move + */ + CLArgOperation(CLArgOperation &&) = default; + + /** + * @brief Assign a CLArgOperation object. + * @param[in] CLArgOperation object to assign. This object will be moved. + */ + CLArgOperation &operator=(CLArgOperation &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The result of arg operation. Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. 
+ * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[out] output The result of arg operation. Data types supported: S32. + * @param[in] op Arg operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis, + const ITensorInfo *output, ArgOperation op); + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + ICLTensor *_input{nullptr}; + ICLTensor *_output{nullptr}; + std::vector<uint32_t> _axis{}; + ArgOperation _arg_op{ArgOperation::MAX}; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr}; + size_t _num_of_kernels{0}; +}; +} +#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h new file mode 100644 index 000000000..d16a0762d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchToSpaceNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLBatchToSpaceND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size A pointer to an array of integer values specifying block sizes + * for spatial dimension. + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..061e34f26 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. + * @param[out] output Output tensor. Data types supported: U8, QASYMM8. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h new file mode 100644 index 000000000..36acfaed7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLCast.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCast class + */ + +#ifndef __ARM_COMPUTE_CLCAST_H__ +#define __ARM_COMPUTE_CLCAST_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLCastKernel. + * This converts the input tensor to the tensor of the output tensor's type. + */ +class CLCast : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's input and output + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +}; +} +#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h new file mode 100644 index 000000000..d78a6ada4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLDepthToSpaceKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLDepthToSpace : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[block_size] block size integer only + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; +} // namesace arm_compute + +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h new file mode 100644 index 000000000..257772a89 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class CLEmbeddingLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); +}; +} +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..fd0a65f20 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       CLFullyConnectedReshapingLayer.h + * @brief      This file contains CLFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class CLFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _cl_fc{memory_manager}, _cl_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. 
+ * @return N/A + */ + void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + const arm_compute::ICLTensor *_input; + const arm_compute::ICLTensor *_weights; + const arm_compute::ICLTensor *_biases; + arm_compute::ICLTensor *_output; + + // buffer for reshaping input tensor + arm_compute::CLTensor _cl_buffer; + +private: + arm_compute::CLFullyConnectedLayer _cl_fc; + // TODO Change to CLReshapeLayer + arm_compute::misc::GenericReshapeLayer _cl_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h new file mode 100644 index 000000000..04d227aa7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLGatherEx.h + * @brief This file contains CLGatherEx class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLGATHEREX_H__ +#define __ARM_COMPUTE_CLGATHEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to to run @ref CLGatherKernel. + */ +class CLGatherEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGatherEx + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..65aa6cbd5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class CLHashtableLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, + ICLTensor *output, ICLTensor *hits); +}; +} +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..ed29db925 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref CLInstanceNormalizationLayerKernelEx + */ +class CLInstanceNormalizationLayerEx : public ICLSimpleFunction +{ +public: + /** Default constructor */ + CLInstanceNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h new file mode 100644 index 000000000..4bf203c5a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ +#define __ARM_COMPUTE_CLLOGICALNOT_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLLogicalNot : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: QASYMM8. + * @param[out] output Output tensor. Data types supported: QASYMM8. 
+ */ + void configure(ICLTensor *input, ICLTensor *output); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..198a0fd4e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h new file mode 100644 index 000000000..622a61b5e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_H__ +#define __ARM_COMPUTE_CLPRELU_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLPReLU : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input. Data types supported: + * QASYMM8/F16/F32. + * @param[in] alpha. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h new file mode 100644 index 000000000..b142d3a2e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLPixelWiseDivision.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLPixelWiseDivision class + */ +#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLPixelWiseDivisionKernel. + */ +class CLPixelWiseDivision : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. 
Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * Note: U8 requires both inputs to be U8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or + * 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest + * even. + * @return N/A + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLPixelWiseDivision + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * Note: U8 requires both inputs to be U8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n + * where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +}; +} +#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h new file mode 100644 index 000000000..7e88cb369 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ + +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLRNNLayerEx */ +class CLRNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. 
Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + CLGEMM _gemm_state_f; + CLSaturatedArithmeticOperationKernel _add_kernel; + CLActivationLayerKernel _activation_kernel; + CLFullyConnectedLayer _fully_connected_kernel; + CLCopyKernel _copy_kernel; + CLTensor _fully_connected_out; + CLTensor _gemm_output; + CLTensor _add_output; + bool _is_prepared; +}; +} +#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..1d367d56b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLReduceOperation class + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform ReduceOperation + */ +class CLReduceOperation : public IFunction +{ +public: + /** + * @brief Construct a new ReduceOperation object + */ + CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager); + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] op Reduce operation to perform. 
+ * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, + bool keep_dims, ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperation. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op); + + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + CLMemoryGroup _memory_group; + ICLTensor *_input; + ICLTensor *_output; + std::set<uint32_t> _axis; + bool _keep_dims; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; + CLReshapeLayer _reshape; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h new file mode 100644 index 000000000..7e2df8986 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToBatchNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32. + * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape + * block_shape, and interleaves these blocks with the "batch" dimension such that in the output. + */ +class CLSpaceToBatchND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Tensor of integer values specifying block sizes for spatial + * dimension. + * Data types supported: S32 + * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial + * dimension. + * Data types supported: S32 + * @param[out] output Output tensor. Data types supported: same as @p input. 
+ * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h new file mode 100644 index 000000000..17f762092 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ +#define __ARM_COMPUTE_CLSPACETODEPTH_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSpaceToDepthKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLSpaceToDepth : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ * @param[in] block_size Block size (integer only) + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h new file mode 100644 index 000000000..6b26a85c8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLStridedSlice.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class + */ + +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLStridedSliceKernel + */ +class CLStridedSliceEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs and outputs + * @param[in] input Tensor input. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Output tensor.
Data type supported: Same as @p input + * @param[in] beginData 'begin' vector of strided slice operation + * @param[in] endData 'end' vector of strided slice operation + * @param[in] stridesData 'strides' vector of strided slice operation + * @param[in] beginMask If the ith bit is set, begin[i] is ignored + * @param[in] endMask If the ith bit is set, end[i] is ignored + * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the + * dimensionality by 1, taking on the value at index begin[i] + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); +}; +} +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h new file mode 100644 index 000000000..20c749e0b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ +#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ +#define __ARM_COMPUTE_CLTOPK_V2_H__ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute TopKV2 operation. + */ +class CLTopKV2 : public IFunction +{ +public: + /** + * @brief Construct a new CLTopKV2 object + */ + CLTopKV2(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2(const CLTopKV2 &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2 &operator=(const CLTopKV2 &) = delete; + + /** + * @brief Construct a new CLTopKV2 object by using copy constructor + * @param[in] CLTopKV2 object to move + */ + CLTopKV2(CLTopKV2 &&) = default; + + /** + * @brief Assign a CLTopKV2 object. + * @param[in] CLTopKV2 object to assign. This object will be moved. + */ + CLTopKV2 &operator=(CLTopKV2 &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input image. Data types supported: U8/S16/F32. + * @param[in] k The value of `k`. + * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if + * input type is F32. + * @param[out] indices Indices related to top k values. Data types supported: S32 if input type + * is U8/S16, F32 if input type is F32. + * @return N/A + */ + void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits = 32, int bits = 4); + + /** + * @brief Run the kernels contained in the function + * Depending on the value of the following environment variables it works differently: + * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE", + * quick sort on GPU is used. 
+ * - If the value of environment variable "ACL_TOPKV2" == ""GPU"", + * radix sort on GPU is used. + * - For other value, TopKV2 runs on CPU + * @return N/A + */ + void run() override; + +private: + void run_on_cpu(); + void run_on_gpu(); + void run_on_gpu_single_quicksort(); + + uint32_t _k; + uint32_t _total_bits; + uint32_t _bits; + uint32_t _radix; + uint32_t _hist_buf_size; + uint32_t _glob_sum_buf_size; + uint32_t _n; + + ICLTensor *_input; + ICLTensor *_values; + ICLTensor *_indices; + + cl::Buffer _qs_idx_buf; + cl::Buffer _qs_temp_buf; + cl::Buffer _hist_buf; + cl::Buffer _glob_sum_buf; + cl::Buffer _temp_buf; + cl::Buffer _first_negative_idx_buf; + cl::Buffer _in_key_buf; + cl::Buffer _out_key_buf; + cl::Buffer _in_ind_buf; + cl::Buffer _out_ind_buf; + + cl::Buffer *_p_in_key_buf; + cl::Buffer *_p_out_key_buf; + cl::Buffer *_p_in_ind_buf; + cl::Buffer *_p_out_ind_buf; +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + CLTopKV2Single _qs_kernel; + CLTopKV2Init _init_kernel; + CLRadixSortHistogram _hist_kernel; + CLRadixSortScanHistogram _scan_hist_kernel; + CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; + CLRadixSortPasteHistogram _paste_hist_kernel; + CLRadixSortReorder _reorder_kernel; + CLTopKV2FindFirstNegative _find_first_negative_kernel; + CLTopKV2ReorderNegatives _reorder_negatives_kernel; + CLTopKV2Store _store_kernel; +#endif +}; +} +#endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h new file mode 100644 index 000000000..340a7bfe9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" + +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +class ICLTensor; +/** Function to run the transpose convolution layer. + * + * @note This layer was copied in order to fix a bug computing to wrong output dimensions. + * + * TransposeConv Layer is the backward pass of Convolution Layer. 
First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. Input stride defines how many zeroes we should put between each element of the + * input, pad is the amount of padding and finally a is a user + * specified value where a < stride - 1, that increases the padding top and right of the input + * image. + * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y + * \f] + * + * where: + * width_input is the size of the first input dimension. + * height_input is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y is the input stride of the first and second dimension. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using the @ref + * CPPFlipWeightsKernel. 
+ * + * This function calls the following OpenCL kernels/functions: + * + * -# @ref CLTransposeConvLayerUpsample + * -# @ref CLConvolutionLayer + * + */ +class CLTransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; + /** Default move constructor */ + CLTransposeConvLayer(CLTransposeConvLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; + /** Default move assignment operator */ + CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the + * transpose convolution, this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been + * reshaped with @ref CLWeightsReshapeKernel.
+ */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the + * transpose convolution, this is described in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel.
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int innvalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + CLTransposeConvLayerUpsample _scale_f; + CLConvolutionLayer _conv_f; + CPPFlipWeightsKernel _flip_weights; + CLTensor _scaled_output; + ICLTensor *_original_weights; + CLTensor _weights_flipped; + bool _is_prepared; +}; +} +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h new file mode 100644 index 000000000..4ae0e1830 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ +class CLTransposeConvLayerUpsample : public IFunction +{ +public: + /** Default constructor */ + CLTransposeConvLayerUpsample(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; + /** Allow instances of this class to be moved */ + CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; + /** Allow instances of this class to be moved */ + CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; + /** Default destructor */ + virtual ~CLTransposeConvLayerUpsample() = default; + + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] inner_border The number of zeros added to right and top edges of the input. + * @param[in] info Contains padding and policies to be used in the deconvolution. 
+ */ + void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayerUpsample + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input. + * @param[in] inner_border The number of zeros added to right and top edges of the input. + * @param[in] info Contains padding and policies to be used in the deconvolution. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + CLTransposeConvLayerUpsampleKernel _upsample; + ICLTensor *_output; +}; +} +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h new file mode 100644 index 000000000..8e7e2f937 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ + +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref CPPUpsample */ +class CPPUpsampleEx : public ICPPSimpleFunction +{ +public: + /** Configure the upsample CPP kernel + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. 
Data types supported: Same as @p input + * @param[in] info Padding information + */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +}; +} +#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h new file mode 100644 index 000000000..37bccc52c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ +#define __ARM_COMPUTE_NEFUNCTIONSEX_H__ + +#include <arm_compute/runtime/NEON/functions/NEArgMinMax.h> +#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECast.h> +#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEGatherEx.h> +#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> +#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEPReLU.h> +#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceSum.h> +#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> + +#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h new file mode 100644 index 000000000..604cd93c4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ +#define __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce min/max operation */ +template <ReductionOperation op> class NEArgMinMaxStatic : public IFunction +{ +public: + /** Constructor */ + NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. 
Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, int axis, ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMax + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEArgMinMaxLayer _reduction_kernel; + Tensor _reduced_out; + NEReshapeLayer _reshape; +}; + +/** Basic function to run arg max. */ +using NEArgMax = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +/** Basic function to run arg min. */ +using NEArgMin = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h new file mode 100644 index 000000000..2a624656d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +class NEBinaryLogicalOperation : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. 
+ * @param[in] op Binary Logical Operation to be performed. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op); +}; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8 + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8 + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
 + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); +}; + +/** Basic function to run logical AND. */ +using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +/** Basic function to run logical OR. */ +using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h new file mode 100644 index 000000000..ae2f57f19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECAST_H__ +#define __ARM_COMPUTE_NECAST_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ +class NECast : public INESimpleFunctionNoBorder +{ +public: + /** Configure the kernel. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, + SubDataType input_subtype = SubDataType::NONE); + /** Static function to check if given info will lead to a valid configuration of @ref NECast + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype = SubDataType::NONE); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NECAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h new file mode 100644 index 000000000..90c0751b8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */ +class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h new file mode 100644 index 000000000..f0c8ecdb5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform negative on an input tensor. */ +class NENegLayer : public INESimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input Input tensor. Data types supported: F16/F32/S32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer + * + * @param[in] input First tensor input info. Data types supported: F16/F32/S32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h new file mode 100644 index 000000000..0646f1668 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file NEEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class NEEmbeddingLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. 
 + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32. + * @return N/A + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref NEEmbeddingLookup + * + * @param[in] input Source tensor info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] lookups Lookups tensor info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); +}; +} +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h new file mode 100644 index 000000000..42a786821 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls + * the following kernels: + * + * -# @ref NETransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor info. 
The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
+ * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; + NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; + Tensor _quantized_input; + Tensor _scale_factor; + Tensor _gemmlowp_output; + const ITensor *_original_weights; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h new file mode 100644 index 000000000..6bd67f322 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * @note The difference from NEFullyConnectedLayer is that this class supports weights as input + * with performance loss. 
+ */ +class NEFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFlattenLayerKernel _flatten_kernel; + NEConvertFullyConnectedWeights _convert_weights; + NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEGEMM _mm_gemm; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _flatten_output; + Tensor _gemmlowp_output; + Tensor _converted_weights_output; + Tensor _reshape_weights_output; + const ITensor *_original_weights; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..18cb61bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       NEFullyConnectedReshapingLayer.h + * @brief      This file contains NEFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> +#include <arm_compute/runtime/Tensor.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class NEFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, //< General FC + PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed + }; + +public: + NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be 
reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @param[in] kernel_type The kernel type for actual FullyConnected layer + * @return N/A + */ + void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, arm_compute::ITensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + const arm_compute::ITensor *_input; + const arm_compute::ITensor *_weights; + const arm_compute::ITensor *_biases; + arm_compute::ITensor *_output; + + // buffer for reshaping input tensor + arm_compute::Tensor _neon_buffer; + +private: + std::unique_ptr<arm_compute::IFunction> _neon_fc; + NEReshapeLayer _neon_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h new file mode 100644 index 000000000..414b9f7d9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. 
This function calls the following + * NEON kernels if the DOT product instruction is not available: + * + * -# @ref NEGEMMInterleave4x4Kernel + * -# @ref NEGEMMTranspose1xWKernel + * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel + * -# @ref NEActivationLayer + * + * otherwise if the DOT product instruction is available: + * + * -# @ref NEGEMMLowpOffsetContributionKernel + * +*/ +class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction +{ +public: + /** Constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move assignment operator */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. 
Data type supported: + * S32 + * @param[out] output Output tensor. Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGEMMLowpMatrixMultiplyCoreEx + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor info (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type + * supported: S32 + * @param[in] output Output tensor info. 
Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, + const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMMAssemblyDispatch _asm_glue; + std::unique_ptr<INEKernel> _mm_kernel; + std::unique_ptr<INEKernel> _mtx_a_reshape_kernel; + std::unique_ptr<INEKernel> _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; + // NEActivationLayer _activation_func; + + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _mm_result_s32; + Tensor _signed_a; + Tensor _signed_output; + const ITensor *_original_b; + int32_t _a_offset; + int32_t _b_offset; + + bool _run_vector_matrix_multiplication; + bool _assembly_path; + bool _fused_assembly_path; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _run_activation; + bool _flip_signedness; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h new file mode 100644 index 000000000..d95e6a81e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHEREX_H__ +#define __ARM_COMPUTE_NEGATHEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEGatherKernelEx */ +class NEGatherEx : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. 
Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h new file mode 100644 index 000000000..69abf0192 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file NEHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class NEHashtableLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookup + * + * @param[in] lookups Lookups 1D tensor info. + * Data types supported: S32 + * @param[in] keys Keys 1D tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits Hits 1D tensor info. 
A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); +}; +} +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..521f50d2f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref NEInstanceNormalizationLayerKernelEx + */ +class NEInstanceNormalizationLayerEx : public IFunction +{ +public: + /** Constructor */ + NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. 
Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEInstanceNormalizationLayerKernelEx _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; +}; +} +#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h new file mode 100644 index 000000000..5664c57cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPRELU_H__ +#define __ARM_COMPUTE_NEPRELU_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEPReLUKernel */ +class NEPReLU : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F32. + * @param[in] alpha Alpha tensor. Data types supported: Same as @p input. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEPRELU_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h new file mode 100644 index 000000000..17c37d806 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ +#define __ARM_COMPUTE_NERNNLAYER_EX_H__ + +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NECopyKernel.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NERNNLayerEx */ +class NERNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx(const NERNNLayerEx &) = delete; + /** Default move constructor */ + NERNNLayerEx(NERNNLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; + /** Default move assignment operator */ + NERNNLayerEx &operator=(NERNNLayerEx &&) = default; + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. 
Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, + const ITensor *bias, ITensor *hidden_state, ITensor *output, + ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMM _gemm_state_f; + NEArithmeticAdditionKernel _add_kernel; + NEActivationLayerKernel _activation_kernel; + NEFullyConnectedLayer _fully_connected_kernel; + NECopyKernel _copy_kernel; + Tensor _fully_connected_out; + Tensor _gemm_output; + Tensor _add_output; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h new file mode 100644 index 000000000..7209acf19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceMeanEx : public IFunction +{ +public: + /** Constructor */ + NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceMeanEx + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr}; + std::unique_ptr<Tensor[]> _reduced_outs{nullptr}; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h new file mode 100644 index 000000000..9c558e6a2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ +#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceOperation : public IFunction +{ +public: + /** Constructor */ + NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, + ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceOperation + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. 
+ * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperationEx> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..c028ea658 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h new file mode 100644 index 000000000..7180742df --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to simulate a reduction operation. This function calls the following NEON + * kernels: + * + * -# @ref NEFillBorderKernel + * -# @ref NEReductionOperationKernelEx + * + */ +class NEReductionOperationEx : public IFunction +{ +public: + /** Default constructor */ + NEReductionOperationEx(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. + */ + void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + NEReductionOperationKernelEx _reduction_kernel; + NEFillBorderKernel _fill_border_kernel; + size_t _window_split; + int _reduction_axis; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h new file mode 100644 index 000000000..302f9af2e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to spatial divide a tensor. This function calls the following NEON + * kernels/functions: + * + * -# @ref NEMemsetKernel + * -# @ref NESpaceToBatchLayerKernel + */ +class NESpaceToBatchLayerEx : public IFunction +{ +public: + /** Default constructor */ + NESpaceToBatchLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; + /** Default destructor */ + virtual ~NESpaceToBatchLayerEx() = default; + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[out] output Tensor output. 
Data types supported: same as @p input + */ + void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, + ITensor *output); + /** Set the input and output tensors. (Static block shape and paddings) + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. + * @param[in] padding_right The right padding of the output tensor. + * @param[out] output Tensor output. Data types supported: same as @p input + */ + void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 + * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. 
+ * @param[in] padding_right The right padding of the output tensor. + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, + const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ + NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ + bool _has_padding; /**< Flag to check if the output has padding */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h new file mode 100644 index 000000000..117717b55 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** This function calls the following NEON kernels/functions: + * + * -# @ref NESpaceToDepthLayerKernelEx + */ +class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. 
Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h new file mode 100644 index 000000000..a50b9ea60 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+ * input depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+ * The weights used by Transpose convolution are supposed to be the same as the ones used for
+ * Convolution.
 Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
+ * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEConvolutionLayer _conv_f; + CPPUpsampleEx _upsample_f; + CPPFlipWeightsKernel _flip_weights; + NEPermute _permute_input; + NEPermute _permute_weights; + NEPermute _permute_output; + Tensor _scaled_output; + Tensor _weights_flipped; + Tensor _permuted_input; + Tensor _permuted_weights; + Tensor _permuted_output; + bool _is_nchw; + const ITensor *_original_weights; + ITensor *_input; + PadStrideInfo _info; + bool _is_prepared; +}; +} // arm_compute +#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h new file mode 100644 index 000000000..3db0c7e5e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file       GenericGather.h + * @brief      This file contains GenericGather class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Gather with both CPU and GPU + */ +class GenericGather : public arm_compute::IFunction +{ +public: + GenericGather(void) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] indices The indices tensor + * @param[in] output The destination tensor + * @param[in] axis (Optional) The axis in input to gather indices from + * @return N/A + */ + void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis = 0); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + +private: + arm_compute::ITensor *_input{nullptr}; + arm_compute::ITensor *_indices{nullptr}; + arm_compute::ITensor *_output{nullptr}; + int _axis{0}; + arm_compute::CLTensor _cl_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLGatherEx _cl_gather; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h new file mode 100644 index 000000000..ab2fdc71d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       GenericReshapeLayer.h + * @brief      This file contains GenericReshapeLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/NEON/functions/NEPermute.h> +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Reshape Layer with both CPU and GPU + */ +class GenericReshapeLayer : public arm_compute::IFunction +{ +public: + GenericReshapeLayer(void) + : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{}, + _cl_reshape{}, _neon_permute{}, _neon_reshape{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] output The destination tensor + * @return N/A + */ + void configure(const arm_compute::ITensor *input, arm_compute::ITensor *output); + +public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; + +private: + const arm_compute::ITensor *_input; + arm_compute::ITensor *_output; + arm_compute::CLTensor _cl_permuted; + arm_compute::Tensor _neon_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLReshapeLayer _cl_reshape; + + arm_compute::NEPermute _neon_permute; + arm_compute::NEReshapeLayer _neon_reshape; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h new file mode 100644 index 000000000..53736f55f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file utils.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains utils for arm compute library + */ +#ifndef __ARM_COMPUTE_MISC_UTILS_H__ +#define __ARM_COMPUTE_MISC_UTILS_H__ + +#include <string> +#include <cassert> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/core/Coordinates.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/Types.h> + +// TODO : It should be extracted to independent module. 
+ +namespace arm_compute +{ +namespace misc +{ +namespace utils +{ + +/** + * @brief Check if this runtime runs on GPU or NEON + * @return @c true if GPU mode, otherwise @c false + */ +bool isGpuMode(); + +#ifndef CAST_CL +#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor) +#endif + +#ifndef CAST_NE +#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor) +#endif + +/** +* @brief Generate arm compute permutation vector from runtime permutation vector +* @param[in] rank Rank number supported upto 4 +* @param[in] runtime_pv Integer array for runtime permutation vector +* @return Permutation vector of arm compute +*/ +arm_compute::PermutationVector getARMComputePermutationVector(uint32_t rank, + const int32_t *runtime_pv); + +/** + * @brief Set value to arm compute tensor with casting + * @param[in] value Value to set + * @param[out] to Target tensor of arm compute + * @param[in] id Position of element + * @return N/A + */ +template <typename FromT> +void copyCast(const FromT value, arm_compute::ITensor *to, const arm_compute::Coordinates &id) +{ + switch (to->info()->data_type()) + { + case arm_compute::DataType::F32: + { + *reinterpret_cast<float *>(to->ptr_to_element(id)) = static_cast<float>(value); + break; + } + case arm_compute::DataType::S32: + { + *reinterpret_cast<int32_t *>(to->ptr_to_element(id)) = static_cast<int32_t>(value); + break; + } + case arm_compute::DataType::U32: + { + *reinterpret_cast<uint32_t *>(to->ptr_to_element(id)) = static_cast<uint32_t>(value); + break; + } + case arm_compute::DataType::QASYMM8: + { + float realValue = static_cast<float>(value); + // NOTE We haven't known the policy of rounding for quantization. + // So this is set to a temporary value. 
+ *(to->ptr_to_element(id)) = + to->info()->quantization_info().quantize(realValue, arm_compute::RoundingPolicy::TO_ZERO); + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } +} + +} // namespace utils +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_UTILS_H__ diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py new file mode 100644 index 000000000..b3e252892 --- /dev/null +++ b/compute/ARMComputeEx/resolve_includes.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +# Copyright (c) 2016, 2017 ARM Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import collections +import os.path +import re +import subprocess +import glob + + +def resolve_includes(target, source): + # File collection + FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') + + # Include pattern + pattern = re.compile("#include \"(.*)\"") + + # Get file contents + files = [] + for i in range(len(source)): + src = source[i] + dst = target[i] + f = open(src) + cts = f.read() + f.close() + contents = cts.splitlines() + entry = FileEntry(target_name=dst, file_contents=contents) + files.append((os.path.basename(src), entry)) + + # Create dictionary of tupled list + files_dict = dict(files) + + # Check for includes (can only be files in the same folder) + final_files = [] + for file in files: + done = False + tmp_file = file[1].file_contents + print(file[1].target_name) + while not done: + file_count = 0 + updated_file = [] + for line in tmp_file: + found = pattern.search(line) + if found: + include_file = found.group(1) + data = files_dict[include_file].file_contents + updated_file.extend(data) + else: + updated_file.append(line) + file_count += 1 + + # Check if all include are replaced. 
+ if file_count == len(tmp_file): + done = True + + # Update temp file + tmp_file = updated_file + + # Append and prepend string literal identifiers and add expanded file to final list + tmp_file.insert(0, "R\"(\n") + tmp_file.append("\n)\"") + entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) + final_files.append((file[0], entry)) + + # Write output files + for file in final_files: + with open(file[1].target_name, 'w+') as out_file: + out_file.write("\n".join(file[1].file_contents)) + + +# Generate embed files +cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') +cl_files += glob.glob('src/core/CL/cl_kernels/*.h') + +# DEBUG: print cl files +print("cl_files:") +print(cl_files) + +embed_files = [f + "embed" for f in cl_files] +print("embed_files:") +print(embed_files) + +resolve_includes(embed_files, cl_files) diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..7d4760600 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels + {"arg_op", "arg_operation.cl"}, + {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast", "cast.cl"}, + {"cast_qasymm_in", "cast.cl"}, + {"cast_qasymm_out", "cast.cl"}, + {"comparison_op", "comparison_op.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, + {"depth_to_space_nchw", "depth_to_space.cl"}, + {"depth_to_space_nhwc", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"permute_generic", "permute_ex.cl"}, + {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, + {"prelu", "prelu.cl"}, + {"prelu_qasymm8", "prelu_quantized.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + 
{"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"space_to_batch_4d_nchw", "space_to_batch.cl"}, + {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, + {"space_to_depth_nchw", "space_to_depth.cl"}, + {"space_to_depth_nhwc", "space_to_depth.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { + "arg_operation.cl", +#include "./cl_kernels/arg_operation.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "depth_to_space.cl", +#include "./cl_kernels/depth_to_space.clembed" + }, + { + "gather_ex.cl", +#include "./cl_kernels/gather_ex.clembed" + }, + { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "instance_normalization_ex.cl", +#include "./cl_kernels/instance_normalization_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "prelu.cl", +#include "./cl_kernels/prelu.clembed" + }, + { + "prelu_quantized.cl", +#include "./cl_kernels/prelu_quantized.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "space_to_batch.cl", +#include "./cl_kernels/space_to_batch.clembed" + }, + { + "space_to_depth.cl", +#include "./cl_kernels/space_to_depth.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + 
}, + { + "topkv2_radixsort.cl", +#include "./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, + +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibraryEx is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported()) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if (arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. 
+ const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + +const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if 
(std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + // GPUTarget _target = get_target_from_device(_device); + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case 
GPUTarget::MIDGARD: + case GPUTarget::T600: + case GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..2a6dfc91f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. + * e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: + * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element + * in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
+ * Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#elif OP_CODE == 2 // ArgMin + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#else + return; + +#endif + + if (tval != value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl new file mode 100644 index 000000000..77e239f55 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define ADD(x, y) add_sat((x), (y)) +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define ADD(x, y) (x) + (y) +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8 + * + * The following computations will be performed: + * + * -# Add offset terms to inputs + -# Get scaled value of two inputs + * -# Add inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The number of bits to shift left of input tensors must be passed at compile time using + * -DLEFT_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of input tensors + * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, + -DIN2_OFFSET, + * -RIN2_MULT_INT and -DIN2_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + -DRESULT_SHIFT + * + * @attention The input and output data_types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The inputs and output scale information of qasymm8 need to be passed at compile time + * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: + * e.g. 
-DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f + * @attention The inputs and output scale offset need to be passed at compile time using + * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: + * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise + * wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] in1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: + * QASYMM8 + * @param[in] in2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] out_ptr Pointer to the destination tensor. + * Supported data types: QASYMM8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, 
(__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Get scaled value of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + + VEC_DATA_TYPE(int, 16) + left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); + VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; + VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; + + VEC_DATA_TYPE(int, 16) + scaled_in1_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); + VEC_DATA_TYPE(int, 16) + scaled_in2_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); + + // Add inputs and multiply with a multiplier smaller than 1 + VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; + VEC_DATA_TYPE(int, 16) + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. + /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..8c875516d --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. 
+ * Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input2_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data
+ * types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  return; // was a bare `return` (missing semicolon): a compile error for any unsupported OP_CODE
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..2342fda9f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef SCALE +#define SCALE 1.0f +#endif +#ifndef OFFSET +#define OFFSET 0 +#endif +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +/** Perform a cast operation on an input tensor. + * + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention -DBOOL_INPUT : Whether type of input is bool. + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res = 
CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +#if defined(BOOL_INPUT) + VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); + VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); + res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +#endif // defined(BOOL_INPUT) + + VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); +} + +/** Perform a cast operation on an QASYMM8 input tensor. + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + + VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; + + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); +} + +/** Perform a cast operation on an QASYMM8 output tensor. + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. 
-DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data 
= VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+  VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+  VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
+
+  VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
+  VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
+
+  VSTORE(VEC_SIZE)
+  (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)output.ptr);
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..e005322f7
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. 
-DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument + * using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + out_index[0] = get_global_id(0); // W + out_index[1] = get_global_id(1); // H + out_index[2] = get_global_id(2) % Z_OUT; // C + out_index[3] = get_global_id(2) / Z_OUT; // B + + in_index[0] = out_index[0] / BLOCK_SIZE; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] + + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +/** Perform 
depth to space rearrangement of tensor (NHWC)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ * using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + out_index[0] = get_global_id(0); // C + out_index[1] = get_global_id(1); // W + out_index[2] = get_global_id(2) % Z_OUT; // H + out_index[3] = get_global_id(2) / Z_OUT; // B + + in_index[0] = out_index[0] + + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] / BLOCK_SIZE; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl new file mode 100644 index 000000000..dd8cb6d93 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + // lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? 
*((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl new file mode 100644 index 000000000..09f776156 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) + +/** Performs the Gather operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source tensor
+ * @param[in]  indices_ptr                          Pointer to the indices tensor. Supported data types: S32
+ * @param[in]  indices_stride_x                     Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                       indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                     Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                       indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                     Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                       indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes Offset of the first element in the destination tensor
+ */
+__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+                        TENSOR4D_DECLARATION(output))
+{
+  // One work item per output element; Z and W of the output are packed into
+  // global dimension 2 and unpacked here using OUTPUT_DIM_Z.
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+  // Compile-time dispatch: the gather axis (AXIS) and the rank of the indices
+  // tensor (INDICES_DIM) select which output coordinates address the indices
+  // tensor and how the remaining coordinates map back onto the input tensor.
+#if AXIS == 0
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
+#elif AXIS == 1
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
+#elif AXIS == 2
+  // NOTE(review): only INDICES_DIM 1 and 2 are covered for AXIS == 2, and only
+  // INDICES_DIM 1 for AXIS == 3 — other combinations leave input_addr
+  // undeclared and fail to compile; presumably ruled out by host-side validate().
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
+#elif AXIS == 3
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
+#endif // AXIS
+
+  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..73f29e3e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using
+ *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor.
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  lookups_ptr                          Pointer to the lookups vector. Supported data types: S32
+ * @param[in]  lookups_stride_x                     Stride of the lookups vector in X dimension (in bytes)
+ * @param[in]  lookups_step_x                       lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
+{
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+  // lup_id[d] is the coordinate used to read the input along dimension d.
+  // Exactly one dimension (the outermost, NUM_DIMS - 1) is remapped through
+  // the lookups vector; the others pass the work-item coordinates through.
+  // As in gather_ex, Z and W share global dimension 2, split by DEPTH_OUT.
+  int lup_id[4] = {0};
+
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT;
+  lup_id[3] = (NUM_DIMS == 4)
+                  ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                  : get_global_id(2) / DEPTH_OUT;
+
+  // A negative lookup key means "no entry": emit zeros for this output row.
+  if (lup_id[NUM_DIMS - 1] < 0)
+  {
+    VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+    return;
+  }
+
+  // NOTE(review): CONVERT_TO_TENSOR4D_STRUCT_NO_STEP already added
+  // input_offset_first_element_in_bytes to in.ptr, so it appears to be added
+  // twice here — harmless only if the offset is 0; confirm against host code.
+  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+   (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 000000000..0e123ae0a
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+    defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+       // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+
+#define EXPAND(x) x
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+// Two-level macros so that `size` is macro-expanded before token pasting.
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+// *_DECLARATION macros expand to the full kernel-argument list the host-side
+// ICLKernel::add_*_argument() calls set up for a buffer of the given rank.
+#define VECTOR_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
+      uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+      uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
+      uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+      uint name##_step_w, uint name##_offset_first_element_in_bytes
+
+// CONVERT_TO_*_STRUCT builds a struct whose ptr is advanced to this work
+// item's element; the *_NO_STEP variants pass 0 steps so ptr stays at the
+// tensor origin (plus the first-element offset).
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                             name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                            name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+                            name##_stride_y, 0)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, name##_step_x, name##_stride_y, \
+                                          name##_step_y, name##_stride_z, name##_step_z)
+
+// NOTE(review): unlike the other NO_STEP variants this one deliberately keeps
+// name##_step_z (matches upstream ACL helpers.h) — do not "fix" to 0.
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
+                                          name##_step_z)
+
+// NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
+// above — legal (token-identical) but redundant; candidate for removal.
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
+                                          name##_stride_x, name##_step_x, name##_stride_y, \
+                                          name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+                               name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               0, name##_stride_y, 0, name##_stride_z, 0)
+
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+                               name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                               0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
+                               mod_size)
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
+  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+  int stride_w;                      /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ *
+ * @return A vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                         uint stride_x, uint step_x)
+{
+  Vector vector = {
+      .ptr = ptr,
+      .offset_first_element_in_bytes = offset_first_element_in_bytes,
+      .stride_x = stride_x,
+  };
+  vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+  return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                       uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+  Image img = {.ptr = ptr,
+               .offset_first_element_in_bytes = offset_first_element_in_bytes,
+               .stride_x = stride_x,
+               .stride_y = stride_y};
+  img.ptr +=
+      img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+  return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem(in bytes)
+ *
+ * @return An image object (Z collapsed into the pointer; only X/Y strides kept)
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+                                                     uint offset_first_element_in_bytes,
+                                                     uint stride_x, uint step_x, uint stride_y,
+                                                     uint step_y, uint stride_z, uint step_z)
+{
+  Image img = {.ptr = ptr,
+               .offset_first_element_in_bytes = offset_first_element_in_bytes,
+               .stride_x = stride_x,
+               .stride_y = stride_y};
+  img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+             get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return img;
+}
+
+/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z)
+{
+  Tensor3D tensor = {.ptr = ptr,
+                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                     .stride_x = stride_x,
+                     .stride_y = stride_y,
+                     .stride_z = stride_z};
+  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+                get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return tensor;
+}
+
+/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data. Global dimension 2 carries both Z and W: the Z coordinate is
+ * get_global_id(2) % mod_size and the W coordinate is get_global_id(2) / mod_size.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x/step_x ... stride_w/step_w  Per-dimension strides and per-workitem steps
+ *                                          (in bytes)
+ * @param[in] mod_size                      Depth (Z extent) used to unpack global dimension 2
+ *
+ * @return A 4D tensor object
+ */
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+  Tensor4D tensor = {.ptr = ptr,
+                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                     .stride_x = stride_x,
+                     .stride_y = stride_y,
+                     .stride_z = stride_z,
+                     .stride_w = stride_w};
+
+  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+                get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+                (get_global_id(2) / mod_size) * step_w;
+  return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+  return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of a Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+  return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+         w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+// Fixed-point (quantized) arithmetic helpers. Each *_IMPL(size) macro defines
+// an inline function over int vectors of the given width; the ASYMM_*(…, size)
+// dispatch macros at the bottom paste the width onto the function name.
+// NOTE(review): the routines and constants appear to mirror gemmlowp's
+// fixed-point math (see the COMPMID-907 reference below) — confirm before
+// changing any magic constant.
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    VEC_DATA_TYPE(int, size) \
+    mask = (1 << exponent) - 1; \
+    const VEC_DATA_TYPE(int, size) zero = 0; \
+    const VEC_DATA_TYPE(int, size) one = 1; \
+    VEC_DATA_TYPE(int, size) \
+    threshold = (mask >> 1) + select(zero, one, x < 0); \
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+  }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(int, size) \
+    overflow = a == b && a == INT_MIN; \
+    VEC_DATA_TYPE(long, size) \
+    a_64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b_64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    ab_64 = a_64 * b_64; \
+    /* COMPMID-907 */ \
+    VEC_DATA_TYPE(int, size) \
+    ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+    return select(ab_x2_high32, INT_MAX, overflow); \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+                                                                              a) \
+  { \
+    const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+    const int k_fractional_bits = 31; \
+    VEC_DATA_TYPE(int, size) \
+    x = a + (1 << (k_fractional_bits - 3)); \
+    VEC_DATA_TYPE(int, size) \
+    x2 = ASYMM_MULT(x, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x3 = ASYMM_MULT(x2, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4 = ASYMM_MULT(x2, x2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2 = \
+        ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+        ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+    return constant_term + \
+           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+  }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding
+ * bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+                                                                VEC_DATA_TYPE(int, size) then_val, \
+                                                                VEC_DATA_TYPE(int, size) else_val) \
+  { \
+    return (if_mask & then_val) ^ (~if_mask & else_val); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a == 0); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with bits set when corresponding bit in @p a is non zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a != 0); \
+  }
+
+/** Multiply @p result by @p fp_multiplier where the bit selected by @p exponent
+ * (relative to the integer/fractional split) is set in @p remainder — one step
+ * of the barrel-shifter exp() evaluation used by asymm_exp_on_negative_values.
+ *
+ * @param[in] size Size of vector.
+ */
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+      VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+      int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+  { \
+    if (k_integer_bits > exponent) \
+    { \
+      const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+      return ASYMM_SELECT_USING_MASK( \
+          ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+          ASYMM_MULT(result, fp_multiplier, size), result, size); \
+    } \
+    \
+    return result; \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+  { \
+    const int k_fractional_bits = 31 - k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    k_one_quarter = 1 << (k_fractional_bits - 2); \
+    VEC_DATA_TYPE(int, size) \
+    mask = k_one_quarter - 1; \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+        a_mod_quarter_minus_one_quarter_scaled, size); \
+    VEC_DATA_TYPE(int, size) \
+    remainder = a_mod_quarter_minus_one_quarter - a; \
+    \
+    result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+                                size); \
+    result = \
+        EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+    \
+    if (k_integer_bits > 5) \
+    { \
+      const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+      result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+    } \
+    \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+  }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    if (exponent < 0) \
+    { \
+      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+    } \
+    \
+    const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+    const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+    int threshold = ((1 << (31 - exponent)) - 1); \
+    VEC_DATA_TYPE(int, size) \
+    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    result = x << exponent; \
+    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+    return result; \
+  }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    VEC_DATA_TYPE(long, size) \
+    a64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    sum = a64 + b64; \
+    const VEC_DATA_TYPE(long, size) one = 1; \
+    const VEC_DATA_TYPE(long, size) minus_one = -1; \
+    VEC_DATA_TYPE(long, size) \
+    sign = select(minus_one, one, sum >= 0); \
+    return convert_int##size((sum + sign) / 2); \
+  }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+      asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+    VEC_DATA_TYPE(int, size) \
+    half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+    const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+    const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+    VEC_DATA_TYPE(int, size) \
+    x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+    /* Three Newton-Raphson refinement iterations of 1/half_denominator. */ \
+    for (int i = 0; i < 3; i++) \
+    { \
+      VEC_DATA_TYPE(int, size) \
+      half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+      VEC_DATA_TYPE(int, size) \
+      one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+      VEC_DATA_TYPE(int, size) \
+      tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+      x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+    } \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+  }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value
+ * accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+                                                      int src_integer_bits, int dst_integer_bits) \
+  { \
+    int exponent = src_integer_bits - dst_integer_bits; \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+  }
+
+// Width-dispatch wrappers: paste `size` onto the implementation name.
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+  asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+  ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+  asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+  asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder, size) \
+  exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+  asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+  asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+  asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+  asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + +ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(4) +ASYMM_MULT_IMPL(8) +ASYMM_MULT_IMPL(16) + +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + +ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(4) +ASYMM_SELECT_USING_MASK_IMPL(8) +ASYMM_SELECT_USING_MASK_IMPL(16) + +ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(4) +ASYMM_MASK_IF_ZERO_IMPL(8) +ASYMM_MASK_IF_ZERO_IMPL(16) + +ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(4) +ASYMM_MASK_IF_NON_ZERO_IMPL(8) +ASYMM_MASK_IF_NON_ZERO_IMPL(16) + +EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(4) +EXP_BARREL_SHIFTER_IMPL(8) +EXP_BARREL_SHIFTER_IMPL(16) + +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) + +ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(4) +ASYMM_ROUNDING_HALF_SUM_IMPL(8) +ASYMM_ROUNDING_HALF_SUM_IMPL(16) + +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + +ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(4) +ASYMM_RESCALE_IMPL(8) +ASYMM_RESCALE_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl new file mode 100644 index 000000000..1d96150f8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and + * standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
+ * -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. + * -DDATA_TYPE=float + * @attention Normalization epsilon parameter should be given as a preprocessor argument with + * -DEPSILON=value. e.g. -DEPSILON=0.001f + * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, + * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported + * data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
+ * Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in + * the destination tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor. + * Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X + * dimension (in bytes) + * @param[in] gamma_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in + * the gamma tensor + * @param[in] beta_ptr (Optional) Pointer to the beta tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X + * dimension (in bytes) + * @param[in] beta_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in + * the beta tensor + */ +__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR4D_DECLARATION(output) +#endif /* IN_PLACE */ +#ifdef GAMMA + , + VECTOR_DECLARATION(gamma) +#endif // GAMMA +#ifdef BETA + , + VECTOR_DECLARATION(beta) +#endif // BETA + ) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +#ifndef IN_PLACE + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +#endif /* IN_PLACE */ + + float sum = 0.f; + float sum_sq = 0.f; + +#if defined(NHWC) + + const int ch = get_global_id(0); // Current channel + const int batch = get_global_id(2); // Current batch + const int elements_plane = DIM_Y * DIM_Z; + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); + sum += data; + sum_sq += data * data; + } + } + +#else // !defined(NHWC) + const int ch = get_global_id(2) % DIM_Z; // Current channel + const int batch = get_global_id(2) / DIM_Z; // Current batch + const int elements_plane = DIM_X * DIM_Y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum = 0.f; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum_sq = 0.f; + // Calculate partial sum + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum += data; + part_sum_sq += data * data; + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE 
*)tensor4D_offset(&in, x, y, ch, batch)); + part_sum.s0 += data; + part_sum_sq.s0 += data * data; + } + } +// Perform reduction +#if VEC_SIZE > 8 + part_sum.s01234567 += part_sum.s89abcdef; + part_sum_sq.s01234567 += part_sum_sq.s89abcdef; +#endif // VEC_SIZE > 8 +#if VEC_SIZE > 4 + part_sum.s0123 += part_sum.s4567; + part_sum_sq.s0123 += part_sum_sq.s4567; +#endif // VEC_SIZE > 4 +#if VEC_SIZE > 2 + part_sum.s01 += part_sum.s23; + part_sum_sq.s01 += part_sum_sq.s23; +#endif // VEC_SIZE > 2 + part_sum.s0 += part_sum.s1; + part_sum_sq.s0 += part_sum_sq.s1; + + sum = (float)part_sum.s0; + sum_sq = (float)part_sum_sq.s0; + +#endif // defined(NHWC) + + const float mean_float = (sum / elements_plane); + const DATA_TYPE mean = (DATA_TYPE)mean_float; + const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float); +#if defined(GAMMA) + const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON); + const DATA_TYPE multip = (DATA_TYPE)multip_float; +#else // !defined(GAMMA) + const DATA_TYPE multip = (DATA_TYPE)0; +#endif // defined(GAMMA) +#if defined(BETA) + const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch); +#else // !defined(BETA) + const DATA_TYPE beta = 0; +#endif // defined(BETA) + +#if defined(NHWC) + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } + +#else // !defined(NHWC) + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, 
batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, input_address); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + beta; + VSTORE(VEC_SIZE) + (res, 0, output_address); + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..4aa7883c3 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: + * S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination + * image + * + */ +__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl new file mode 100644 index 000000000..2074d3ceb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of + * GEMMLowp to QASYMM8 + * + * The following computations will be performed by the kernel: + * + * -# Add offset terms to inputs + * -# Multiply inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. 
-DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and + * -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + * -DRESULT_SHIFT + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data + * types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + + // Multiply with a multiplier smaller than 1 + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. 
+ /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl new file mode 100644 index 000000000..62a8901f6 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Can only take floating point data types. + * + * @param[in] input1_ptr Pointer to the source image. 
Supported Data + * types : F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : F16/F32 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image + * + * @param[out] output_ptr Pointer to the destination image. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 + ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) + : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl new file mode 100644 index 000000000..5e0abd585 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ + defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define SELECT_TYPE VEC_INT + +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. + * -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Can only take uchar data types. + * + * @param[in] input1_ptr Pointer to the source image. 
Supported Data + * types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : QASYMM8 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); + VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); + + in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); + alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); + + const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); + const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); + const VEC_FLOAT outf32 = + select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); + const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); + const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); + + VSTORE(VEC_SIZE) + (res, 0, (__global uchar *)output.ptr); +} + +#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && + // 
defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..d7ea2e2c4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform reduce max/min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#else // OP NOT SUPPORTED + return; + +#endif + } + + *((__global DATA_TYPE *)out.ptr) = value; +} + +/** Perform reduce sum/mean + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
+ */ +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE sum_value = (DATA_TYPE)0; + for (int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *( + (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } + +#if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; + +#elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); + +#else // OP NOT SUPPORTED + return; + +#endif +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl new file mode 100644 index 000000000..7367da7fb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) +/** Perform space to batch with input of 4D and NCHW format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[1] = get_global_id(1) * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[2] = get_global_id(2) % DEPTH_OUT; + in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; + + if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); + } +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && 
defined(ZERO_VALUE) + +#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) +/** Perform space to batch with input of 4D and NHWC format + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using + * -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 + * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE) + ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], + in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE 
*)out.ptr); + } +} + +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl new file mode 100644 index 000000000..a26e762e8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + in_index[0] = get_global_id(0); // W + in_index[1] = get_global_id(1); // H + in_index[2] = get_global_id(2) % Z_IN; // C + in_index[3] = get_global_id(2) / Z_IN; // B + + out_index[0] = in_index[0] / BLOCK_SIZE; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = + in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) + +#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = 
CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + in_index[0] = get_global_id(0); // C + in_index[1] = get_global_id(1); // W + in_index[2] = get_global_id(2) % Z_IN; // H + in_index[3] = get_global_id(2) / Z_IN; // B + + out_index[0] = + in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = in_index[2] / BLOCK_SIZE; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..50472e4f9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "helpers.h"
+
+// Scatter the strided input vector into flat working buffers:
+// keys = the float values, indices = the original positions 0..n-1.
+// Work is distributed grid-wide; n is assumed to be a multiple of the
+// global work size (iter = n / gws has no remainder handling).
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+                          __global int *in_ind_buf, const int n)
+{
+  int gid = get_global_id(0);
+  int lws = get_local_size(0);
+  int groups = get_num_groups(0);
+  int gws = lws * groups;
+  int iter = n / gws;
+
+  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+  for (int i = 0; i < iter; ++i)
+  {
+    int idx = i * gws + gid;
+    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
+    in_ind_buf[idx] = idx;
+  }
+}
+
+// Locate the index of the first negative key in the sorted key buffer.
+// At most one work item satisfies each branch, so the single write to
+// *first_negative_idx is race-free.
+// NOTE(review): keys equal to 0.f match neither `> 0.f` nor `< 0.f`; this
+// presumably relies on the preceding sort's positive/negative split --
+// confirm against the host-side sort passes.
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+                                         __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  if (gid == n - 1)
+  {
+    // if the last item is positive, the first negative index is n.
+    if (out_key_buf[gid] > 0.f)
+      *first_negative_idx = n;
+  }
+  else if (gid == 0)
+  {
+    // if the first item is negative, set it 0.
+    if (out_key_buf[gid] < 0.f)
+      *first_negative_idx = 0;
+  }
+  else
+  {
+    // if its left is positive and it is negative, then it is the first negative item.
+    if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
+      *first_negative_idx = gid;
+  }
+}
+
+// Move the trailing negative block in front (read back-to-front) and shift
+// the remaining items after it.
+// FIX: in_ind_buf/out_ind_buf hold int indices -- topkv2_init writes them as
+// int and topkv2_store reads them as int -- but they were declared
+// __global float *, so indices were copied through float loads/stores.
+// Declare them __global int * consistently (host code passes cl_mem buffer
+// objects, so no host-side change is needed).
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+                                       __global int *in_ind_buf, __global int *out_ind_buf,
+                                       __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  int num_negs = n - *first_negative_idx;
+  int in_idx;
+
+  if (gid < num_negs)
+  {
+    // negatives are read from the end of the buffer, reversed
+    in_idx = n - 1 - gid;
+  }
+  else
+  {
+    in_idx = gid - num_negs;
+  }
+
+  out_key_buf[gid] = in_key_buf[in_idx];
+  out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+// Copy the results into the output tensors, reading the working buffers
+// back-to-front (idx = n - 1 - gid).
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+                           __global float *out_key_buf, __global int *out_ind_buf, int n)
+{
+  int gid = get_global_id(0);
+
+  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+  int idx = n - 1 - gid;
+
+  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..9594daf19
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+// Address of the idx-th float element of a strided ACL Vector.
+__global inline float *get_vec_elem(Vector *vec, int idx)
+{
+  return (__global float *)(vec->ptr + idx * vec->stride_x);
+}
+
+// Address of the idx-th int element of a strided ACL Vector.
+__global inline int *get_vec_elem_int(Vector *vec, int idx)
+{
+  return (__global int *)(vec->ptr + idx * vec->stride_x);
+}
+
+// A utility function to swap two elements
+void swap(__global float *a, __global float *b)
+{
+  float t = *a;
+  *a = *b;
+  *b = t;
+}
+
+void swap_idx(__global int *a, __global int *b)
+{
+  int t = *a;
+  *a = *b;
+  *b = t;
+}
+
+/* Lomuto partition around the last element; `>=` makes the order descending.
+   This function is the same in both the iterative and recursive versions. */
+int partition(Vector *arr, __global int *indices, int l, int h)
+{
+  float x = *get_vec_elem(arr, h);
+  int i = (l - 1);
+
+  for (int j = l; j <= h - 1; j++)
+  {
+    if (*get_vec_elem(arr, j) >= x)
+    {
+      i++;
+      swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
+      swap_idx(&indices[i], &indices[j]);
+    }
+  }
+  swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
+  swap_idx(&indices[i + 1], &indices[h]);
+  return (i + 1);
+}
+
+/* A[] --> Array to be sorted,
+   l --> Starting index,
+   h --> Ending index
+   Iterative quicksort using the caller-provided `stack` as the explicit
+   recursion stack; `indices` is permuted in lock-step with the keys. */
+void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
+{
+  // Create an auxiliary stack
+
+  // initialize top of stack
+  int top = -1;
+
+  // push initial values of l and h to stack
+  stack[++top] = l;
+  stack[++top] = h;
+
+  // Keep popping from stack while it is not empty
+  while (top >= 0)
+  {
+    // Pop h and l
+    h = stack[top--];
+    l = stack[top--];
+
+    // Set pivot element at its correct position
+    // in sorted array
+    int p = partition(arr, indices, l, h);
+
+    // If there are elements on left side of pivot,
+    // then push left side to stack
+    if (p - 1 > l)
+    {
+      stack[++top] = l;
+      stack[++top] = p - 1;
+    }
+
+    // If there are elements on right side of pivot,
+    // then push right side to stack
+    if (p + 1 < h)
+    {
+      stack[++top] = p + 1;
+      stack[++top] = h;
+    }
+  }
+}
+
+// Single-work-item kernel: sort all n inputs descending, then copy out the
+// first k values and their original indices.
+__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
+                               VECTOR_DECLARATION(topk_indices), __global int *indices,
+                               __global int *temp_stack, int k, int n)
+{
+  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+  Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
+  Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
+
+  for (int i = 0; i < n; ++i)
+  {
+    indices[i] = i;
+  }
+
+  quickSortIterative(&input, indices, temp_stack, 0, n - 1);
+
+  // extract k items.
+  for (int i = 0; i < k; ++i)
+  {
+    *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
+    *get_vec_elem_int(&topk_indices, i) = indices[i];
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..f6830d229
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+// Keys are float values reinterpreted as raw unsigned bits; _RADIX/_BITS are
+// build-time constants. n is assumed divisible by (groups * items): the
+// sub-list size uses integer division with no remainder handling.
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+                                  const int pass, __local int *loc_histo, const int n)
+{
+  int it = get_local_id(0);  // i local number of the processor
+  int ig = get_global_id(0); // global number = i + g I
+
+  int gr = get_group_id(0); // g group number
+
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
+
+  // set the local histograms to zero
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    loc_histo[ir * items + it] = 0;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // range of keys that are analyzed by the work item
+  int size = n / groups / items; // size of the sub-list
+  int start = ig * size;         // beginning of the sub-list
+
+  unsigned int key;
+  int shortkey, k;
+
+  // compute the index
+  // the computation depends on the transposition
+  for (int j = 0; j < size; j++)
+  {
+#ifdef TRANSPOSE
+    k = groups * items * j + ig;
+#else
+    k = j + start;
+#endif
+
+    // reinterpret the float key as its raw bit pattern
+    key = *((__global unsigned int *)(in_key_buf + k));
+
+    // extract the group of _BITS bits of the pass
+    // the result is in the range 0.._RADIX-1
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+    // increment the local histogram
+    loc_histo[shortkey * items + it]++;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // copy the local histogram to the global one
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+  }
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list for improving
+// coalescent memory access
+// Tiles of tilesize x tilesize are staged through local memory; when PERMUT
+// is defined the permutation array is transposed in lock-step.
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+                        const int nbrow, const __global int *inperm, __global int *outperm,
+                        __local int *blockmat, __local int *blockperm, const int tilesize)
+{
+
+  int i0 = get_global_id(0) * tilesize; // first row index
+  int j = get_global_id(1);             // column index
+
+  int jloc = get_local_id(1); // local column index
+
+  // fill the cache
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int k = (i0 + iloc) * nbcol + j; // position in the matrix
+    blockmat[iloc * tilesize + jloc] = invect[k];
+#ifdef PERMUT
+    blockperm[iloc * tilesize + jloc] = inperm[k];
+#endif
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // first row index in the transpose
+  int j0 = get_group_id(1) * tilesize;
+
+  // put the cache at the good place
+  for (int iloc = 0; iloc < tilesize; iloc++)
+  {
+    int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+    outvect[kt] = blockmat[jloc * tilesize + iloc];
+#ifdef PERMUT
+    outperm[kt] = blockperm[jloc * tilesize + iloc];
+#endif
+  }
+}
+
+// each virtual processor reorders its data using the scanned histogram
+// loc_histo is preloaded with the scanned (prefix-summed) histogram, so
+// loc_histo[shortkey * items + it] is the next output slot for that radix.
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+                                __global int *d_Histograms, const int pass,
+                                __global int *indices_in, __global int *indices_out,
+                                __local int *loc_histo, const int n)
+{
+
+  int it = get_local_id(0);
+  int ig = get_global_id(0);
+
+  int gr = get_group_id(0);
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
+
+  int start = ig * (n / groups / items);
+  int size = n / groups / items;
+
+  // take the histogram in the cache
+  for (int ir = 0; ir < _RADIX; ir++)
+  {
+    loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int newpos, shortkey, k, newpost;
+  unsigned int key;
+
+  for (int j = 0; j < size; j++)
+  {
+#ifdef TRANSPOSE
+    k = groups * items * j + ig;
+#else
+    k = j + start;
+#endif
+    float org_value = in_key[k];
+    key = *(__global unsigned int *)(in_key + k);
+    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+    newpos = loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+    int ignew, jnew;
+    ignew = newpos / (n / groups / items);
+    jnew = newpos % (n / groups / items);
+    newpost = jnew * (groups * items) + ignew;
+#else
+    newpost = newpos;
+#endif
+
+    // d_outKeys[newpost]= key; // killing line !!!
+    out_key[newpost] = org_value;
+
+#ifdef PERMUT
+    indices_out[newpost] = indices_in[k];
+#endif
+
+    newpos++;
+    loc_histo[shortkey * items + it] = newpos;
+  }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+// Produces an exclusive scan per work-group; per-group totals go to globsum
+// for the follow-up radixsort_pastehistograms pass.
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+                                       __global int *globsum)
+{
+  int it = get_local_id(0);
+  int ig = get_global_id(0);
+  int decale = 1;
+  int n = get_local_size(0) * 2;
+  int gr = get_group_id(0);
+
+  // load input into local memory
+  // up sweep phase
+  temp[2 * it] = histo[2 * ig];
+  temp[2 * it + 1] = histo[2 * ig + 1];
+
+  // parallel prefix sum (algorithm of Blelloch 1990)
+  for (int d = n >> 1; d > 0; d >>= 1)
+  {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
+      temp[bi] += temp[ai];
+    }
+    decale *= 2;
+  }
+
+  // store the last element in the global sum vector
+  // (maybe used in the next step for constructing the global scan)
+  // clear the last element
+  if (it == 0)
+  {
+    globsum[gr] = temp[n - 1];
+    temp[n - 1] = 0;
+  }
+
+  // down sweep phase
+  for (int d = 1; d < n; d *= 2)
+  {
+    decale >>= 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (it < d)
+    {
+      int ai = decale * (2 * it + 1) - 1;
+      int bi = decale * (2 * it + 2) - 1;
+
+      int t = temp[ai];
+      temp[ai] = temp[bi];
+      temp[bi] += t;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // write results to device memory
+
+  histo[2 * ig] = temp[2 * it];
+  histo[2 * ig + 1] = temp[2 * it + 1];
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
+{
+  int ig = get_global_id(0);
+  int gr = get_group_id(0);
+
+  int s;
+
+  s = globsum[gr];
+
+  // write results to device memory
+  histo[2 * ig] += s;
+  histo[2 * ig + 1] += s;
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
new file mode 100644
index 000000000..7f4b5b0df
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Shape of the arg-op result: same rank as input with `axis` reduced to 1.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+// NOTE(review): the first two checks use ERROR_ON (abort/throw style) macros
+// inside a Status-returning function, unlike the RETURN_ERROR_ON checks
+// below -- presumably intentional, but worth confirming for release builds.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ArgOperation /*op*/)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
+                                        DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
+                                    output->tensor_shape().num_dimensions(),
+                                  "Input's rank is not same with output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+  return Status{};
+}
+
+} // namespace
+
+CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+// Build the "arg_op" OpenCL kernel (op selected via -DOP_CODE) and configure
+// the execution window over the reduced output shape.
+void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+                                     ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel and set op_code based on type of ArgOperation as specified by object op
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                      const uint32_t axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+// Enqueue the kernel over collapsed 4D slices. The output's TensorInfo shape
+// is temporarily replaced with the reduced shape for argument setup and
+// restored before returning.
+void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..c14e73634
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Check types (U8/QASYMM8) and broadcast compatibility of the two inputs.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+    TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+      "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+  : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+// Build the "binary_logical_op" kernel (AND/OR picked via -DOP_CODE) and set
+// up a broadcast-aware execution window over both inputs.
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                        ICLTensor *output, BinaryLogicalOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "binary_logical_op";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+  int op_code = 0;
+  switch (op)
+  {
+    case BinaryLogicalOperation::AND:
+      op_code = 1;
+      break;
+    case BinaryLogicalOperation::OR:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+    ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): operator|| short-circuits, so the second/third
+  // update_window_and_padding calls are skipped once one returns true --
+  // looks intentional (same idiom as upstream ACL broadcast kernels), but
+  // confirm all windows/paddings end up updated.
+  update_window_and_padding(win_input1, input1_access) ||
+    update_window_and_padding(win_input2, input2_access) ||
+    update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue over collapsed 3D slices; per-input slices handle broadcasting of
+// dimensions of size 1.
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+      (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+    can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                 : window;
+
+  const TensorShape &in_shape1_collapsed =
+    has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+    has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Right-hand border required to replicate the smaller input up to the
+// vectorized processing width.
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  const unsigned int replicateSize =
+    _output->info()->dimension(0) -
+    std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border =
+    std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
new file mode 100644
index 000000000..35f607bd0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
+
+// Select one of three cast kernels: "cast_qasymm_in" (dequantize input),
+// "cast_qasymm_out" (quantize output), or plain "cast". Shapes must match;
+// input_subtype == BOOL adds -DBOOL_INPUT for the plain variant.
+void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  _input = input;
+  _output = output;
+
+  constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+  // Set kernel build options
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option("-DDATA_TYPE_OUT=" +
+                        get_cl_type_from_data_type(output->info()->data_type()));
+  build_opts.add_option(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  // Create kernel
+  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+  {
+    // quantized -> other: bake the input's scale/offset into the kernel
+    const float scale_in = input->info()->quantization_info().scale;
+    const int offset_in = input->info()->quantization_info().offset;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
+  }
+  else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
+  {
+    // other -> quantized: bake the output's scale/offset into the kernel
+    const float scale_in = output->info()->quantization_info().scale;
+    const int offset_in = output->info()->quantization_info().offset;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
+
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
+  }
+  else
+  {
+    build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
+    _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
+  }
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  update_window_and_padding(win, input_access, output_access);
+  output_access.set_valid_region(win, input->info()->valid_region());
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue the cast kernel over collapsed 3D slices.
+void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    add_3D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..2a3433c2b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// TODO Use this validation function
+#if 0
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+    output->dimension(2) != input->dimension(2) / (block_size * block_size),
+    "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+#endif
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+  // DO NOTHING
+}
+
+// Build the layout-specific "depth_to_space_<layout>" kernel; the output's
+// data layout selects the kernel variant and the -DDEPTH_OUT constant.
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  // TODO Add validation of data_layout
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  auto layout_out = output->info()->data_layout();
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto depth = output->info()->dimension(index_depth);
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+  build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+    "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueue over collapsed 4D output slices; the input slice is left unset so
+// the kernel indexes the input itself.
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode
100644 index 000000000..0862b78bf --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + 
ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp new file mode 100644 index 000000000..718f615f9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + std::unique_ptr<ITensorInfo> 
output_info = input->clone(); + output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} + 
+Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..31e98c9a8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() +{ + // DO NOTHING +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + 
ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window 
&window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), 
win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..5db414f62 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + 
+CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) +{ +} + +void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + _run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), + gamma ? gamma->info() : nullptr, + beta ? beta->info() : nullptr, epsilon)); + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); + build_opts.add_option_if(gamma, "-DGAMMA"); + build_opts.add_option_if(beta, "-DBETA"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + 
ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window collapsed_window = window.collapse(window, Window::DimZ); + + // We will process the planes together + if (_input->info()->data_layout() == DataLayout::NCHW) + { + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + } + else + { + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1)); + } + + Window vec_window; + vec_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, collapsed_window); + if (!_run_in_place) + { + add_4D_tensor_argument(idx, _output, collapsed_window); + } + if (_gamma) + { + add_1D_tensor_argument(idx, _gamma, vec_window); + } + if (_beta) + { + add_1D_tensor_argument(idx, _beta, vec_window); + } + + enqueue(queue, *this, collapsed_window, lws_hint()); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..ecfe05a51 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 
(c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + 
("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..e7d587029 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + 
ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), 
Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..24e89db28 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32, DataType::S32); + if (op == ReduceOperation::SUM) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, + "Not support QASYMM8, yet"); + } + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + return Status{}; +} +} // namespace + +CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, + const uint32_t 
axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel name + std::string kernel_name; + int op_code = 0; + if (op == ReduceOperation::MAX) + { + kernel_name = "reduce_min_max"; + op_code = 1; + } + else if (op == ReduceOperation::MIN) + { + kernel_name = "reduce_min_max"; + op_code = 2; + } + else if (op == ReduceOperation::SUM) + { + kernel_name = "reduce_sum_mean"; + op_code = 3; + } + else if (op == ReduceOperation::MEAN) + { + kernel_name = "reduce_sum_mean"; + op_code = 4; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLReduceOperationKernel::run(const 
Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + // Support dimensions up to 4 + Window slice_out = window.collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions + // of input and output are the same + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp new file mode 100644 index 000000000..f7836b6cd --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size, + const ITensorInfo *padding_size, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::F16, DataType::S32, + DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(), + "The number of dimensions of input should be equal to output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(), + "The input and output layouts are different!"); + + // TODO Support other cases + if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 
|| + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0), + "Input Depth should be equal to Output Depth"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 || + padding_size->dimension(1) != 2, + "Only 2-dimensional spatial block's size was wrong"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4, + "CLSpaceToBatchNDKernel supports dimensions up to 4"); + + if (input->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(), + "The input and output quantization info are different!"); + } + + return Status{}; +} + +} // namespace + +CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() +{ + // DO NOTHING +} + +void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info())); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; + + // Set kernel build options + // TODO Support other cases + std::string kernel_name = "space_to_batch_4d"; + std::set<std::string> build_opts; + Window win; + + if (input->info()->data_layout() == DataLayout::NCHW) + { + kernel_name += "_nchw"; + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DWIDTH_IN=" + 
support::cpp11::to_string(input->info()->dimension(0))); + + win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else if (input->info()->data_layout() == DataLayout::NHWC) + { + kernel_name += "_nhwc"; + build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->info()->valid_region()); + + if (window_changed) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"); + } + } + else + { + ARM_COMPUTE_ERROR("Unsupported layout"); + } + + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3))); + if (input->info()->data_type() == DataType::QASYMM8) + { + build_opts.emplace("-DZERO_VALUE=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + } + else + { + build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + 
ICLKernel::configure_internal(win); +} + +void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + const_cast<ICLTensor *>(_block_size)->map(queue); + const_cast<ICLTensor *>(_padding_size)->map(queue); + + const size_t num_dimensions = _input->info()->num_dimensions(); + const size_t num_spacial_dimensions = _block_size->info()->dimension(0); + uint32_t batch_size = _input->info()->dimension(num_dimensions - 1); + for (size_t i = 0; i < num_spacial_dimensions; ++i) + { + const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i})); + const int32_t padding_size_pre = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i})); + const int32_t padding_size_post = + *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i})); + + ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1"); + ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0, + "Padding size should be greater than or equal to 0"); + + if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(i) != + (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + else + { + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch 
size should be equal to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..b085192a2 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), + "Input batch should be equal to Output batch"); + + auto layout_out = input->data_layout(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); + auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), + "Output depth should be equal to (input depth * block size *block size)"); + + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || + (input->dimension(index_height) % block_size), + "Input height and width should be divisible by block size"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || + (output->dimension(index_height) != (input->dimension(index_height) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); + + return Status{}; +} + +} // namespace + +CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} + +void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); + + _input = input; + _output = output; + + // Set kernel build options + auto layout_out = input->info()->data_layout(); + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = input->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, 
output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..4f2b388c9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// 
+CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. 
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( 
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 
_HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), 
_in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? 
cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", 
build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); 
+ add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute +#endif // Disable GPU implementation diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp new file mode 100644 index 000000000..6cc8d9d13 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() + : _input(nullptr), _output(nullptr), _inner_border(), _info() +{ +} + +Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, + "inner_border_right must be smaller that stride_x"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, + "inner_border_top must be smaller that stride_y"); + + return Status{}; +} + +void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _inner_border = inner_border; + _info = info; + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( + input->info(), output->info(), inner_border, info)); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const DataLayout data_layout = _input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const int out_start_x = _info.pad_left(); + const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - + _info.pad_right() + _info.stride().first - 1; + const int out_step_x 
= _info.stride().first; + + const int out_start_y = _inner_border.top + _info.pad_top(); + const int out_end_y = + _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; + const int out_step_y = _info.stride().second; + + switch (data_layout) + { + case DataLayout::NCHW: + { + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + Window slice_out = collapsed.first_slice_window_3D(); + slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (collapsed.slide_window_slice_3D(slice_in) && + collapsed.slide_window_slice_3D(slice_out)); + break; + } + case DataLayout::NHWC: + { + // NOTE: not collapsing in NHWC + Window slice_out = window.first_slice_window_3D(); + slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data layout"); + } +} diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp new file mode 100644 index 000000000..8ac667ceb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics 
Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} + +bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } + +void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _info = info; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + // Initialize _scaled_output buffer + const int width_scaled = _output->info()->dimension(0); + const int height_scaled = _output->info()->dimension(1); + const int stride_x = _info.stride().first; + const int stride_y = _info.stride().second; + const int start_x = _info.pad_left(); + const int start_y = _info.pad_top(); + const int end_y = height_scaled - _info.pad_bottom(); + const int end_x = width_scaled - _info.pad_top(); + const size_t element_size = _input->info()->element_size(); + + // The fill value is normally 0, but for QASYMM8 the '0' 
corresponds to the offset + const uint8_t fill_value = + _output->info()->data_type() == DataType::QASYMM8 + ? utility::clamp<uint8_t>(_output->info()->quantization_info().offset) + : 0; + // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte + // values in a buffer of uint8_ts + std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); + + // Create window + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); + window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); + + // Create iterators + Iterator in(_input, window); + Iterator out(_output, window_out); + + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..4508f5800 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" + +#include <algorithm> +#include "arm_compute/core/Types.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +namespace +{ +void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +using namespace arm_compute; +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op_templ( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_value, + output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = + reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = + reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, + input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +} // namespace + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; + return out; +} + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], 
invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; + store_quantized_int32(output_ptr, out); +} + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale) +{ + const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t broadcast_vector = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + }}; + return broadcast_vector; +} + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const int output_offset = out->info()->quantization_info().offset; + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from + // zero) + const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto non_broadcast_input_ptr = + reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = + dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, + invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, + non_broadcast_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, + out->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); + const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); + const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = + (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset); + *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + 
const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)) +{ + elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) +{ + elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..d2f42de53 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const 
uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const 
Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, 
input2, output); + break; + case BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp new file mode 100644 index 000000000..7e4fc129b --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2019 Samsung 
Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NECastKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && + input->data_type() != DataType::U8); + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); + + // NECastKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} + +typedef struct bool8x16 +{ + uint8x16_t val; +} bool8x16_t; + +static inline 
uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } + +template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; } +template <> inline uint8x16_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +} + +template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const float32x4x4_t ret = {{ + 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +{ + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const uint8x16_t &v) +{ + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint8x16_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +} + +template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + const uint32x4x4_t ret = {{ + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), + 
vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), + vcvtq_f32_s32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +{ + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +} + +template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), + vcvtq_f32_u32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const float32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), + 
vqmovun_s32(vcvtq_s32_f32(v.val[1])))), + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), + vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +} + +template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +{ + const uint32x4x4_t ret = {{ + vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), + vcvtq_u32_f32(v.val[3]), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), + vcvtq_s32_f32(v.val[3]), + }}; + + return ret; +} + +template <typename T> struct cast_vector; +template <> struct cast_vector<bool> +{ + using type = bool8x16_t; +}; +template <> struct cast_vector<uint8_t> +{ + using type = uint8x16_t; +}; +template <> struct cast_vector<uint32_t> +{ + using type = uint32x4x4_t; +}; +template <> struct cast_vector<int32_t> +{ + using type = int32x4x4_t; +}; +template <> struct cast_vector<float> +{ + using type = float32x4x4_t; +}; + +template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v) +{ + wrapper::vstore(ptr, v); +} + +inline bool8x16_t vloadq(const bool *ptr) +{ + bool8x16_t ret; + ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr)); + return ret; +} + +template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr) +{ + return wrapper::vloadq(ptr); +} + +template <> inline typename cast_vector<bool>::type load_input(const bool *ptr) +{ + return vloadq(ptr); +} + +template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr) +{ + return vld4q_u32(ptr); +} + +template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr) +{ + 
return vld4q_s32(ptr); +} + +template <> inline typename cast_vector<float>::type load_input(const float *ptr) +{ + return vld4q_f32(ptr); +} + +template <typename T> inline T get_value(const T *ptr) { return *ptr; } + +template <> inline bool get_value(const bool *ptr) +{ + bool ret = (*ptr != 0); + return ret; +} + +template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<FromT>::type; + const from_vector vin = load_input(in_ptr + x); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, 
qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + FromT val = get_value(in_ptr + x); + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} + +void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = 
static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + const auto &qinfo_in = input->info()->quantization_info(); + const auto &qinfo_out = output->info()->quantization_info(); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<float>::type; + const auto vf = wrapper::vloadq(in_ptr + x); + const auto vin = vdequantize(vf, qinfo_in); + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename 
cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_t qval_in = *(in_ptr + x); + const auto val = qinfo_in.dequantize(qval_in); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const auto qval_out = qinfo_out.quantize(val, rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} +} // namespace + +NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +{ +} + +void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); + + _input = input; + _output = output; + _input_subtype = input_subtype; + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + + 
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void NECastKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::U8: + if (_input_subtype == SubDataType::BOOL) + { + run_cast<bool>(_input, _output, window); + } + else + { + run_cast<uint8_t>(_input, _output, window); + } + break; + case DataType::QASYMM8: + run_cast_qasymm8(_input, _output, window); + break; + case DataType::U32: + run_cast<uint32_t>(_input, _output, window); + break; + case DataType::S32: + run_cast<int32_t>(_input, _output, window); + break; + case DataType::F32: + run_cast<float>(_input, _output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp new file mode 100644 index 000000000..8a2223c26 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include <arm_neon.h> +#include <cstdint> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != + 0); + // Validate output if initialized + if (output->total_size() != 0) + { + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() + : _input(nullptr), _output(nullptr), _block_shape() +{ +} + +void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, + int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + 
TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); + + _input = input; + _output = output; + _block_shape = block_shape; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + ICPPKernel::configure(win); +} + +Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); + return Status{}; +} + +void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + const int idx_channel = + get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const int depth_size = _input->info()->dimension(idx_channel); + const int r = (depth_size / (_block_shape * _block_shape)); + const int element_size = _input->info()->element_size(); + + Window slice_out = window.first_slice_window_3D(); + + // The slice_out slice does not move + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Main loop for NCHW and NHWC + if (_input->info()->data_layout() == DataLayout::NCHW) + { + Window slice_in = window.first_slice_window_2D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + 
const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); + } + else + { + Window slice_in = window.first_slice_window_3D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp new file mode 100644 index 000000000..cebd614df --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <cstdint> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; + +namespace +{ +template <ElementWiseUnaryEx op, typename ScalarType> +inline ScalarType elementwise_op_scalar(const ScalarType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return -a; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename VectorType> +inline VectorType elementwise_op(const VectorType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return wrapper::vneg(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename ScalarType> +void elementwise_op(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, + elementwise_op<op>(wrapper::vloadq(input_ptr + 
x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x)); + } + }, + input, output); +} + +template <ElementWiseUnaryEx op> +std::function<void(const ITensor *input, ITensor *output, const Window &window)> +configure_func(const ITensor *input, ITensor *output) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *> + map_function = { + {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>}, + }; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + map_function["op_F16_F16"] = &elementwise_op<op, float16_t>; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input, ITensor *output, const Window &window) { + func(input, output, window); + }; + } + return nullptr; +} +} // namespace + +NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() + : _function(nullptr), _input(nullptr), _output(nullptr) +{ +} + +void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, + ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Configure kernel window + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _output = 
output; + + INEKernel::configure(win); + + switch (op) + { + case ElementWiseUnaryEx::NEG: + _function = configure_func<ElementWiseUnaryEx::NEG>(input, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, + DataType::S32); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + } + + return Status{}; +} + +Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); + return Status{}; +} + +void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_function == nullptr); + _function(_input, _output, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..5401afea0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != 
output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..ce2413dc1 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. 
+ */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), 0); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), _axis); + + U new_index; + switch 
(_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + 
default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void 
NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..391337bfb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + // Validate in case of configured hits + if (hits->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1)); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + } + + return Status{}; +} + +void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + const int const_0 = _output->info()->data_type() == DataType::QASYMM8 + ? 
_output->info()->quantization_info().offset + : 0; + + std::unordered_map<int32_t, size_t> key_index_map; + for (size_t n = 0; n < _keys->info()->dimension(0); ++n) + { + const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n})); + key_index_map[key] = n; + } + std::vector<size_t> lookup_indices; + for (size_t k = 0; k < _lookups->info()->dimension(0); ++k) + { + const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k})); + const auto it = key_index_map.find(key); + if (it == key_index_map.end()) + { + lookup_indices.emplace_back(NOT_HIT); + *_hits->ptr_to_element({k}) = 0; + } + else + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= _keys->info()->dimension(0)) + ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices.emplace_back(it->second); + *_hits->ptr_to_element({k}) = 1; + } + } + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..1ea77fb5c --- /dev/null 
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. */ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto 
vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), 
ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), 
DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + + if (_input->info()->data_type() == DataType::F32) + { + _func = &instance_normalization_nchw<float>; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (_input->info()->data_type() == DataType::F16) + { + _func = &instance_normalization_nchw<float16_t>; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + (*_func)(_input, _output, _gamma, _beta, _epsilon, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..de218d489 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +inline int32x4x4_t load_value(const int32_t *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), 
wrapper::vloadq(input_ptr + 12)}; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t ret = {{ + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + }}; + return ret; +} +} // namespace + +NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) +{ +} + +void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor, + ITensor *output, float multiplier) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = 
scale_factor; + _output = output; + _multiplier = multiplier; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier) +{ + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + + return Status{}; +} + +template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); +} + +void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) +{ + 
ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_output->info()->data_type()) + { + case DataType::F32: + NEMultiplyScaleFactorKernel::multiply<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEMultiplyScaleFactorKernel::multiply<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp new file mode 100644 index 000000000..ad1bb9051 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +using namespace arm_compute; +namespace +{ + +/** Conditional element-wise operations */ +enum class ConditionalOperation +{ + PRELU, /**< (x * y) for x < 0, x for x >= 0 */ +}; + +template <ConditionalOperation op, typename ScalarType> +inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case ConditionalOperation::PRELU: + res = a < 0 ? 
a * b : a; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <ConditionalOperation op> +inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, + QuantizationInfo qinfo) +{ + return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP); +} + +template <ConditionalOperation op, typename VectorType> +inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + VectorType const_0 = {0, 0, 0, 0}; + + switch (op) + { + case ConditionalOperation::PRELU: + res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); + ; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <ConditionalOperation op> +inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + float32x4x4_t out = {{ + elementwise_conditional_op<op>(a.val[0], b.val[0]), + elementwise_conditional_op<op>(a.val[1], b.val[1]), + elementwise_conditional_op<op>(a.val[2], b.val[2]), + elementwise_conditional_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_conditional_op<op>(reorder ? broadcast_vector : a, + reorder ? 
a : broadcast_vector); +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, + int window_step_x, const uint8_t *input1_ptr, + const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, + float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + 
+template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_broadcast_loop( + int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af, + reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>, + &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_conditional_op_loop<op, ScalarType, VectorType>); +} + +template <ConditionalOperation op> +void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>, + &elementwise_conditional_op_quantized_broadcast_loop<op>, + &elementwise_conditional_op_quantized_loop<op>); +} +} // namespace + +NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); + + // Configure kernel window 
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _alpha = alpha; + _output = output; + INEKernel::configure(win); +} + +void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + if (_input->info()->data_type() == DataType::F32) + { + elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha, + _output, window); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output, + window); + } + else + { + ARM_COMPUTE_ERROR("Wrong Type"); + } +} + +Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Checks performed when output is configured + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEPReLUKernel::validate(const ITensorInfo *input, const 
ITensorInfo *alpha, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); + + return Status{}; +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..acf0092eb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float32x4_t round(const float32x4_t &fv) +{ + const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f); + const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f); + // If value < 0, mask = -1, else mask = 0 + int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4)); + return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4)); +} + +inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale) +{ + const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv); + const int32x4_t vposend = vdupq_n_s32(max_scale); + const int32x4_t vnagend = vdupq_n_s32(-max_scale); + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#else //__aarch64__ + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#endif //__aarch64__ + }}; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} +} // namespace + +NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) +{ +} + +void 
NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output, + ITensor *scale_factor) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), scale_factor->info())); + + _input = input; + _output = output; + _scale_factor = scale_factor; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor)); + + return Status{}; +} + +template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window; + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), 
std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp new file mode 100644 index 000000000..59e7d9beb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>

namespace arm_compute
{
// File-local NEON reduction helpers and functors; only visible to this
// translation unit via the anonymous namespace.
namespace
{
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
float32x2_t calculate_min(float32x4_t in)
{
  // 4 float lanes -> two pairwise-min steps reduce to a single value,
  // broadcast across both lanes of the returned float32x2_t.
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
float32x2_t calculate_max(float32x4_t in)
{
  // Mirror of calculate_min using pairwise max.
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmax(pmax, pmax);
}
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
int32x2_t calculate_min(int32x4_t in)
{
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
int32x2_t calculate_max(int32x4_t in)
{
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  return wrapper::vpmax(pmax, pmax);
}

// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
inline uint8x8_t calculate_min(uint8x16_t in)
{
  // 16 u8 lanes: the high/low split leaves 8 candidates, so three further
  // pairwise steps (log2(8)) are needed to broadcast the minimum.
  auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
  pmin = wrapper::vpmin(pmin, pmin);
  pmin = wrapper::vpmin(pmin, pmin);
  return wrapper::vpmin(pmin, pmin);
}
// Helper function to calculate the maximum value of the input vector. All the elements in the
// output vector contain the max value.
inline uint8x8_t calculate_max(uint8x16_t in)
{
  auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
  pmax = wrapper::vpmax(pmax, pmax);
  pmax = wrapper::vpmax(pmax, pmax);
  return wrapper::vpmax(pmax, pmax);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Helper function to calculate the minimum value of the input vector. All the elements in the
// output vector contain the min value.
+inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_1D(); + Window out_slice = out_window.first_slice_window_1D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), op); + } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), + output->info()->dimension(1))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 1, op); + } while (in_window.slide_window_slice_2D(in_slice) && + 
out_window.slide_window_slice_2D(out_slice)); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), + output->info()->dimension(2))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 2, op); + } while (in_window.slide_window_slice_3D(in_slice) && + out_window.slide_window_slice_3D(out_slice)); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_4D(); + Window out_slice = out_window.first_slice_window_4D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 3, op); + } while (in_window.slide_window_slice_4D(in_slice) && + out_window.slide_window_slice_4D(out_slice)); + } +}; + +template <typename T, int S> struct RedOpX +{ + /** NEON vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + init_res_value = *reinterpret_cast<T *>(input.ptr()); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); + break; + } + case ReduceOperation::MAX: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } +}; + +struct RedOpX_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + + uint8x16_t vec_res_value = {0}; + + if (op == ReduceOperation::MIN || op == ReduceOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto vec_elements = 
wrapper::vloadq(input.ptr()); + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + break; + } + case ReduceOperation::MAX: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + } +}; + +template <typename T, int S> struct RedOpYZW +{ + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr())); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr; + switch (axis) + { + case 1: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); + break; + case 2: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); + break; + case 3: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); + break; + 
default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +struct RedOpYZW_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + auto vec_res_value = wrapper::vloadq(input.ptr()); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + uint8_t *in_ptr; + switch (axis) + { + case 1: + in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); + break; + case 2: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); + break; + case 3: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, + const ReduceOperation op) +{ + const bool is_complex = (input->info()->num_channels() == 
2); + if (is_complex) + { + ARM_COMPUTE_ERROR("Not supported"); + } + + switch (axis) + { + case 0: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, + RedOpX<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); + case DataType::S32: + return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), + op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 1: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 2: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return 
Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 3: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + default: + ARM_COMPUTE_ERROR("Unsupported reduction axis"); + } +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + + if (input->num_channels() == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + DataType::F16, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); + + const TensorShape output_shape = + 
arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + // Calculate output shape and set if empty + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + + // Output auto initialization if not yet initialized + DataType output_data_type = input->data_type(); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + + return std::make_tuple(err, win); +} +} // namespace + +NEReductionOperationKernelEx::NEReductionOperationKernelEx() + : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX), + _border_size() +{ +} + +BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; } + +void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + unsigned int num_elems_processed_per_iteration = + 16 / data_size_from_type(input->info()->data_type()); + + _input = input; + _output = output; + _border_size = + (axis == 0) + ? BorderSize(0, num_elems_processed_per_iteration - + (input->info()->dimension(0) % num_elems_processed_per_iteration), + 0, 0) + : BorderSize(); + _op = op; + _reduction_axis = axis; + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( + validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); + + return Status{}; +} + +void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + reduce_op(window, _input, _output, _reduction_axis, _op); +} +} // namespace arm_compute diff 
--git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
new file mode 100644
index 000000000..36a2f55a9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -0,0 +1,165 @@
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include <arm_neon.h>
#include <cstdint>

using namespace arm_compute::misc::shape_calculator;

namespace arm_compute
{
namespace
{
// Checks that (input, output, block_shape) form a valid SpaceToDepth
// configuration. Returns an error Status on the first violated constraint;
// output constraints are only checked once the output info is initialized.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
{
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
  // Kernel only addresses up to 4D tensors (batch, spatial dims, channels).
  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);

  // A block size below 1 cannot tile the spatial dimensions.
  ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);

  // Validate output if initialized
  if (output->total_size() != 0)
  {
    const DataLayout data_layout = input->data_layout();
    const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int idx_height =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const int idx_channel =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
    const int idx_batch =
        get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
    // Spatial dims must tile evenly into block_shape x block_shape cells.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
    // Batch count is unchanged by SpaceToDepth.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
                                output->tensor_shape()[idx_batch]);
    // Output channels are input channels multiplied by block_shape^2.
    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
                                0);
    // The op is a pure rearrangement: element counts must match.
    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
                                output->tensor_shape().total_size());
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  }

  return Status{};
}
} // namespace

// Members start unset; configure() must be called before run().
NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
    : _input(nullptr), _output(nullptr),
      _block_shape()
{
}

// Wires the kernel to its tensors and derives the iteration window.
// The output info is auto-initialized from the input if the caller left it
// empty; validation runs after that so the checks see the final shapes.
void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
                                            int32_t block_shape)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

  TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());

  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));

  _input = input;
  _block_shape = block_shape;
  _output = output;

  // Configure kernel window: iterate the output one element at a time
  // (Steps() with no vectorization step).
  Window win = calculate_max_window(*output->info(), Steps());
  INEKernel::configure(win);
}

// Static argument check; mirrors configure() without touching any state.
Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
                                             int32_t block_shape)
{
  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
  return Status{};
}

// Gathers each output element from its source input coordinate and copies it
// byte-wise. The window iterates over the OUTPUT tensor; input coordinates
// are derived per element from the block decomposition of the output channel.
void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
{
  ARM_COMPUTE_UNUSED(info);
  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);

  const DataLayout data_layout = _input->info()->data_layout();
  const int channel_idx =
      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
  const int element_size = _input->info()->element_size();

  const size_t channel_size = _input->info()->dimension(channel_idx);

  Window slice_out = window.first_slice_window_3D();

  // Each 3D slice of the window corresponds to one batch; batch_id tracks it.
  int batch_id = 0;

  // Main loop for NCHW and NHWC
  if (_output->info()->data_layout() == DataLayout::NCHW)
  {
    do
    {
      Iterator out(_output, slice_out);
      execute_window_loop(slice_out,
                          [&](const Coordinates &id) {
                            // Output channel encodes (input channel, block cell):
                            // channel_id / channel_size selects the cell inside the
                            // block (% -> x offset, / -> y offset), channel_id %
                            // channel_size is the original input channel.
                            const size_t channel_id = id.z();
                            const size_t in_x =
                                id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
                            const size_t in_y =
                                id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
                            const int z = channel_id % channel_size;
                            Coordinates input_coords{in_x, in_y, z, batch_id};
                            // NOTE(review): memcpy is used but <cstring> is not
                            // included directly in this file — relies on a
                            // transitive include; confirm.
                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
                          },
                          out);
      ++batch_id;
    } while (window.slide_window_slice_3D(slice_out));
  }
  else
  {
    // NHWC: dimension 0 is the channel, dimensions 1/2 are spatial.
    do
    {
      Iterator out(_output, slice_out);
      execute_window_loop(slice_out,
                          [&](const Coordinates &id) {
                            const size_t channel_id = id.x();
                            const size_t in_x =
                                id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
                            const size_t in_y =
                                id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
                            const int z = channel_id % channel_size;
                            Coordinates input_coords{z, in_x, in_y, batch_id};
                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
                          },
                          out);
      ++batch_id;
    } while (window.slide_window_slice_3D(slice_out));
  }
}
} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..94242b56b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,45 @@
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2016-2018 ARM Limited.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Error.h" + +using namespace arm_compute; + +const std::pair<unsigned int, unsigned int> +arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + const unsigned int padx = info.pad_left() + info.pad_right(); + const unsigned int pady = info.pad_top() + info.pad_bottom(); + + ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1); + ARM_COMPUTE_ERROR_ON(kernel_width <= padx); + ARM_COMPUTE_ERROR_ON(kernel_height <= pady); + + // Find the transpose conv out dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right; + const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom; + + return std::make_pair<unsigned int, unsigned int>(w, h); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp new file mode 100644 index 000000000..158fe0b0c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty file aims to validate "CLFunctionsEx.h".
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgOperation::CLArgOperation()
+{
+  // DO NOTHING
+}
+
+// Build one CLArgOperationKernel per reduction axis, chained through
+// intermediate CLTensors: input -> interm[0] -> ... -> interm[n-2] -> output.
+// Each kernel reduces exactly one axis; the last kernel writes to `output`.
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+                               ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+  _input = input;
+  _output = output;
+  _axis = axis;
+  _arg_op = op;
+  // NOTE The argminmax_axis must have no duplication.
+  _num_of_kernels = axis.size();
+  // One intermediate tensor between every pair of consecutive kernels.
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _argop_kernels =
+      arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+  // Shrink the shape one axis at a time; each intermediate keeps the reduced
+  // axis as a dimension of size 1.
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+            .set_data_layout(input->info()->data_layout()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Apply ArgMinMax on all kernels
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+  }
+}
+
+// Static validation mirroring configure(): builds temporary TensorInfos for
+// the intermediate results and validates each kernel in the chain.
+// NOTE(review): assumes axis is non-empty (axis.size() - 1 would otherwise
+// wrap around) — presumably guaranteed by the caller; confirm.
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+                                const ITensorInfo *output, ArgOperation op)
+{
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate argminmax only on all kernels
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+  }
+
+  return Status{};
+}
+
+// Enqueue the configured kernels in chain order on the CL command queue.
+void CLArgOperation::run()
+{
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_argop_kernels[i]);
+  }
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp new file mode 100644 index 000000000..742fc6f59 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLCast.h" + +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +using namespace arm_compute; + +void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..c6b166163 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h" + +using namespace arm_compute; + +void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input, + const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, + arm_compute::ICLTensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); + _cl_reshape.configure(_input, &_cl_buffer); + + _cl_fc.configure(&_cl_buffer, _weights, _biases, _output); + + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } + else + { + _cl_fc.configure(_input, _weights, _biases, _output); + } +} + +void CLFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _cl_reshape.run(); + + _cl_fc.run(); +} + +void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp new file mode 100644 index 000000000..6cad9bd2e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLGatherEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +using namespace arm_compute; + +void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return CLGatherExKernel::validate(input, indices, output, axis); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..86ea5a66d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + +void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) +{ + auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); +} + +Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp new file mode 100644 index 000000000..2a34c0664 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+      _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+// Static validation for the RNN cell:
+//   hidden' = act(W * input + R * hidden + bias)
+// Checks the shape relationships between input, weights, recurrent weights,
+// bias, hidden state and output, then validates each sub-operation against a
+// temporary TensorInfo describing the intermediate [hidden_size x batch] shape.
+Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                              const ITensorInfo *hidden_state, const ITensorInfo *output,
+                              const ActivationLayerInfo &info)
+{
+  const int idx_width = 0;
+  const int idx_height = 1;
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+                                      output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+                              recurrent_weights->dimension(idx_width));
+  // Recurrent weights must be square (hidden_size x hidden_size).
+  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+                              recurrent_weights->dimension(1));
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                     hidden_state->tensor_shape());
+
+  // Shape of every intermediate result of the cell.
+  auto shape_info =
+      TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+                 input->data_type());
+
+  ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+      ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+  return Status{};
+}
+
+// Configure the cell. Intermediate buffers are registered with the memory
+// group before the consumers are configured and allocated only after every
+// consumer has been configured, so the memory manager can reuse them.
+void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+                             const ICLTensor *recurrent_weights, const ICLTensor *bias,
+                             ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+  ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+                                                    recurrent_weights->info(), bias->info(),
+                                                    hidden_state->info(), output->info(), info));
+
+  const int idx_height = 1;
+  TensorShape shape =
+      compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+  _is_prepared = false;
+
+  _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+  _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+  // Manage intermediate buffers and configure
+  _memory_group.manage(&_fully_connected_out);
+  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+  _memory_group.manage(&_gemm_output);
+  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+  _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+  _memory_group.manage(&_add_output);
+
+  _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+                        &_add_output, ConvertPolicy::SATURATE);
+
+  _fully_connected_out.allocator()->allocate();
+  _gemm_output.allocator()->allocate();
+
+  // Activation writes the new hidden state in place of the old one.
+  _activation_kernel.configure(&_add_output, hidden_state, info);
+  _add_output.allocator()->allocate();
+
+  _copy_kernel.configure(hidden_state, output);
+}
+
+// Execute one cell step: FC(input) + GEMM(hidden) -> add -> activation,
+// then copy the updated hidden state to the output tensor.
+void CLRNNLayerEx::run()
+{
+  prepare();
+
+  _memory_group.acquire();
+
+  _fully_connected_kernel.run();
+  _gemm_state_f.run();
+  CLScheduler::get().enqueue(_add_kernel);
+  CLScheduler::get().enqueue(_activation_kernel);
+
+  // copy hidden out to output
+  CLScheduler::get().enqueue(_copy_kernel);
+
+  _memory_group.release();
+}
+
+// One-time weight preparation for the sub-functions; idempotent.
+void CLRNNLayerEx::prepare()
+{
+  if (!_is_prepared)
+  {
+    _fully_connected_kernel.prepare();
+    _gemm_state_f.prepare();
+
+    _is_prepared = true;
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..13a25c901
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+      _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+// Static validation. One kernel per axis; when keep_dims is false, the last
+// kernel writes to an extra intermediate that is then reshaped into `output`
+// (dropping the size-1 dimensions), so there is one more intermediate tensor.
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const std::set<uint32_t> &axis, bool keep_dims,
+                                   const ReduceOperation &op)
+{
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  // Shrink one axis (to size 1, without removing it) per intermediate.
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+    interm_tensors[i].set_data_layout(input->data_layout());
+    interm_tensors[i].set_quantization_info(input->quantization_info());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate ReduceOperation only on all kernels
+  it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+  }
+
+  if (!keep_dims)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+  }
+
+  return Status{};
+}
+
+// Configure the reduction chain:
+//   input -> kernel[0] -> interm[0] -> ... -> kernel[n-1] -> output
+// With keep_dims the final kernel writes straight to `output`; otherwise the
+// final kernel writes to the last intermediate and a reshape drops the
+// reduced (size-1) dimensions into `output`.
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+                                  const std::set<uint32_t> &axis, bool keep_dims,
+                                  ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+  _axis = axis;
+
+  _input = input;
+  _output = output;
+  _keep_dims = keep_dims;
+
+  // NOTE The axis must have no duplication.
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduce_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Apply ReduceOperation on all kernels
+  // Each intermediate is registered with the memory group before its producer
+  // kernel is configured, and allocated once its consumer is configured.
+  TensorShape shape{input->info()->tensor_shape()};
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    if (!keep_dims || i != (num_of_kernels - 1))
+    {
+      _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+      _memory_group.manage(&_interm_tensors[i]);
+    }
+    _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+    if (i != 0)
+    {
+      _interm_tensors[i - 1].allocator()->allocate();
+    }
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+    _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+  }
+}
+
+// Run the reduction kernels in order, then the optional reshape.
+void CLReduceOperation::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  const size_t num_of_kernels = _axis.size();
+  for (size_t i = 0; i < num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduce_kernels[i]);
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +using namespace arm_compute; + +void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..80d50ad94 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. 
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + _p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else +#endif // Disable GPU implementation + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else +#endif + { + run_on_cpu(); + } +} + +#if 0 +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. 
Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} +#endif // Disable GPU implementation + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported 
type."); + + int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..40e21671d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include <memory> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _is_prepared(false) +{ +} + +Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + const unsigned int kernel_x = weights->dimension(idx_w); + const unsigned int kernel_y = weights->dimension(idx_h); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, + "invalid_right must be smaller than kernel_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, + "inner_border_top must be smaller than kernel_y"); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + 
unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were + // added. 
+ auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, 
weights_info); + _scaled_output.allocator()->allocate(); +} + +void CLTransposeConvLayer::run() +{ + prepare(); + + _memory_group.acquire(); + + _scale_f.run(); + _conv_f.run(); + + _memory_group.release(); +} + +void CLTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _weights_flipped.map(true); + _original_weights->map(CLScheduler::get().queue(), true); + CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); + _weights_flipped.unmap(); + _original_weights->unmap(CLScheduler::get().queue()); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp new file mode 100644 index 000000000..0ce3e6700 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; + +CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT + : _upsample(), + _output(nullptr) +{ +} + +Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _output = output; + _upsample.configure(input, _output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::run() +{ + _output->map(CLScheduler::get().queue(), true); + if (is_data_type_quantized_asymmetric(_output->info()->data_type())) + { + const uint8_t quantized_zero = _output->info()->quantization_info().offset; + std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); + } + else + { + memset(_output->buffer(), 0, _output->info()->total_size()); + } + _output->unmap(CLScheduler::get().queue()); + + CLScheduler::get().enqueue(_upsample, false); +} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp new file mode 100644 index 000000000..f8e0ef8a6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" + +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +{ + auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>(); + k->configure(input, output, info); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp new file mode 100644 index 000000000..5ba465b61 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ + +template <ReductionOperation OP> +NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape() +{ +} + +template <ReductionOperation OP> +Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + + TensorShape out_shape = input->tensor_shape(); + const int input_dims = input->num_dimensions(); + int axis_local = axis; + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1); + out_shape.remove_dimension(axis_local); + + const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +template <ReductionOperation OP> +void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor 
*output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + int axis_local = axis; + const int input_dims = input->info()->num_dimensions(); + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + // Perform reduction for axis + TensorShape intermediate_shape = input->info()->tensor_shape(); + intermediate_shape.set(axis_local, 1); + auto in = input; + + _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(), + output->info()->data_type(), + output->info()->quantization_info())); + _memory_group.manage(&_reduced_out); + _reduction_kernel.configure(in, axis_local, &_reduced_out, OP); + + // Allocate intermediate tensor + _reduced_out.allocator()->allocate(); + + // Configure reshape layer if we want to drop the dimensions + TensorShape out_shape = input->info()->tensor_shape(); + out_shape.remove_dimension(axis_local); + auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_out, output); +} + +template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + _reduction_kernel.run(); + _reshape.run(); +} + +// Supported Specializations +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..7c15fc453 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp new file mode 100644 index 000000000..f2490e4e8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NECast.h" + +#include "arm_compute/core/NEON/kernels/NECastKernel.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + return NECastKernel::validate(input, output, input_subtype); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp new file mode 100644 index 000000000..db419e3a8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp new file mode 100644 index 000000000..a95018a28 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h" + +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NENegLayer::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>(); + k->configure(ElementWiseUnaryEx::NEG, input, output); + _kernel = std::move(k); +} +Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..00c3ed94f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..d604fedbf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

#include <algorithm>
#include <cmath>

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

namespace
{
// File-local helper: validate the low-precision (gemmlowp) matrix multiply
// for an already-quantized input against the S8 weights.
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
  ARM_COMPUTE_RETURN_ON_ERROR(
      NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));

  return Status{};
}
} // namespace

// Thin wrapper: weight reshaping for this layer is a plain transpose,
// delegated to NETransposeKernel.
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
  auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
  k->configure(input, output);
  _kernel = std::move(k);
}

Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
                                                           const ITensorInfo *output)
{
  return NETransposeKernel::validate(input, output);
}

// NOTE(review): _multiply_scale_kernel, _quant_input_kernel's output tensors and
// _gemmlowp_output are members not listed here; they are default-constructed.
NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
    std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
      _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
      _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
      _accumulate_biases(false), _is_prepared(false)
{
}

// Configure the low-precision GEMM; expects input width == weights height
// (i.e. weights already transposed to [out, in] layout).
void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
                                               ITensor *output)
{
  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));

  // Configure gemmlowp function
  _mm_gemmlowp.configure(input, weights, nullptr, output);
}

// Build the hybrid FC pipeline: (optional weight transpose) -> symmetric
// quantization of the F16/F32 input to S8 -> gemmlowp matmul -> rescale by the
// per-run scale factor -> (optional bias accumulation, scheduled in run()).
void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
                                            const ITensor *biases, ITensor *output,
                                            FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

  // Perform validate step
  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
      fc_info));

  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
  _accumulate_biases = false;
  _original_weights = weights;

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    _accumulate_biases = true;

    // Configure accumulate biases kernel
    _accumulate_biases_kernel.configure(output, biases);
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensor *weights_to_use = weights;

  // Check if we have a fully connected layer with batches
  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
  // NOTE(review): underscore-prefixed name for a LOCAL shadows the member
  // naming convention; it is only used to assert the unsupported case below.
  bool _is_fc_after_conv;
  if (is_batched_fc_layer)
  {
    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                    input->info()->tensor_shape().cend(),
                                    output->info()->tensor_shape().cbegin() + 1));
  }
  else
  {
    _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
  }
  // This layer only supports FC-after-FC; a conv-shaped input is rejected here.
  ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv,
                           "NEFullyConnectedHybridLayer does not support after conv");
  (void)_is_fc_after_conv;

  // Reshape weights if needed
  if (!_are_weights_reshaped)
  {
    // Reshape the weights
    _reshape_weights_output.allocator()->init(
        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
            compute_transposed_shape(*weights->info())));
    _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
    weights_to_use = &_reshape_weights_output;
  }

  // Quantize input: S8 values plus one F32 scale factor per output row.
  _quantized_input.allocator()->init(
      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
  _scale_factor.allocator()->init(
      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
  _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);

  // GEMM: accumulate into S32, rescaled to the output type below.
  _gemmlowp_output.allocator()->init(
      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);

  // Multiply scale
  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
                                   weights->info()->quantization_info().scale);

  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;

  // Intermediates can be allocated once all configure() calls have been made.
  _quantized_input.allocator()->allocate();
  _scale_factor.allocator()->allocate();
  _gemmlowp_output.allocator()->allocate();
}

// Static validation mirroring configure(): checks types (F16/F32 input,
// S8 weights), dimensionality, and validates every sub-kernel in order.
Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
                                             const ITensorInfo *biases, const ITensorInfo *output,
                                             FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);

  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;

  // NOTE(review): const ref bound to a temporary TensorInfo — lifetime is
  // extended to the end of this function, so taking its address below is safe.
  const ITensorInfo &reshaped_weights =
      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
          compute_transposed_shape(*weights)));

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensorInfo *weights_to_use = weights;

  if (!weights_reshaped)
  {
    // Validate reshape weights kernel
    ARM_COMPUTE_RETURN_ON_ERROR(
        NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
    weights_to_use = &reshaped_weights;
  }

  // Fully Connected layer after a Fully Connected Layer without batches
  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));

  // Validate quantization kernel
  const ITensorInfo &quantized_input = TensorInfo(
      input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
  const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
  ARM_COMPUTE_RETURN_ON_ERROR(
      NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));

  const ITensorInfo &gemmlowp_output = TensorInfo(
      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  // Validate matrix multiply kernel
  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));

  ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
      &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));

  return Status{};
}

// Execute the configured pipeline; one-time work is delegated to prepare().
void NEFullyConnectedHybridLayer::run()
{
  prepare();

  MemoryGroupResourceScope scope_mg(_memory_group);

  // Quantize input
  NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);

  // Run matrix multiply
  _mm_gemmlowp.run();

  // Multiply scale factor
  NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);

  // Accumulate biases if provided
  if (_accumulate_biases)
  {
    NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
  }
}

// One-time preparation: reshape weights (if requested), prepare the GEMM, and
// free the reshaped-weights scratch tensor once nothing uses it anymore.
void NEFullyConnectedHybridLayer::prepare()
{
  if (!_is_prepared)
  {
    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

    auto release_unused = [](Tensor *w) {
      if (!w->is_used())
      {
        w->allocator()->free();
      }
    };

    // Reshape of the weights (happens only once)
    if (!_are_weights_reshaped)
    {
      // Run reshape weights kernel and mark weights as unused
      _reshape_weights_output.allocator()->allocate();
      _reshape_weights_function.run();

      _are_weights_reshaped = true;
      // We can not release _original_weights because it can be used in other nodes
    }

    // Prepare GEMM prepare and release unused weights
    _mm_gemmlowp.prepare();

    // Release reshaped weights if unused
    release_unused(&_reshape_weights_output);

    _is_prepared = true;
  }
}

/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().scale, + -input.quantization_info().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, + -weights.quantization_info().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), _reshape_weights_output(), 
_original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), _is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info( + QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); + weights->info()->set_quantization_info( + QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + 
input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); 
+
+    weights_to_use = &_converted_weights_output;
+    _are_weights_converted = false;
+  }
+
+  // When quantized, the matrix multiply emits S32 into _gemmlowp_output and the
+  // output stage below requantizes into the user's output tensor.
+  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+  if (_is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    configure_conv_fc(input, weights_to_use, tmp_output);
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    configure_fc_fc(input, weights_to_use, tmp_output);
+  }
+
+  // Configure output stage for asymmetric quantized types
+  if (_is_quantized)
+  {
+    // Effective rescale factor: (scale_in * scale_w) / scale_out, expressed as a
+    // fixed-point multiplier + shift for the integer-only output stage.
+    float multiplier = input->info()->quantization_info().scale *
+                       weights->info()->quantization_info().scale /
+                       output->info()->quantization_info().scale;
+    int output_multiplier;
+    int output_shift;
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+                                                               &output_shift);
+    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+                                     output_shift, output->info()->quantization_info().offset);
+    _gemmlowp_output.allocator()->allocate();
+  }
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+// Static shape/type validation mirroring configure(): builds temporary TensorInfo
+// descriptors (no allocations) and checks each sub-kernel that configure() would use.
+// Returns an error Status on the first failed check.
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  // If the caller does not want a transpose, the weights are treated as already reshaped.
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true;
+  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+  const ITensorInfo &flatten_input =
+      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_flatten_shape(input)));
+  const ITensorInfo &reshaped_weights =
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+  const ITensorInfo &converted_weights =
+      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                       : TensorInfo(*reshaped_weights.clone());
+  const ITensorInfo &gemmlowp_output = TensorInfo(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+  // Configure accumulate biases kernel for non quantized asymmetric types
+  if (biases != nullptr && !is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+  if (is_batched_fc_layer)
+  {
+    // Batched case: previous layer was a convolution iff the trailing (batch) dims of
+    // the input match the output dims shifted by one.
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        (weights_to_use->dimension(1) !=
+         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+  // Validate matrix multiply kernel
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+  // Validate output stage for asymmetric quantized types
+  if (is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+        &gemmlowp_output, biases, output));
+  }
+
+  return Status{};
+}
+
+// Executes the fully-connected layer. Because prepare() below is compiled out
+// (#if 0), the one-time weight allocation/reshape/conversion is performed inline
+// here on the first call, guarded by _is_prepared.
+void NEFullyConnectedLayerEx::run()
+{
+  if (!_is_prepared)
+  {
+    // First run only: allocate the internal weight buffers that configure() set up.
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Reshape of the weights
+    if (!_are_weights_reshaped)
+    {
+      _reshape_weights_function.run();
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      _convert_weights.run();
+    }
+
+    // Prepare GEMM prepare
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+
+  // Accumulate biases if provided
+  if (_is_quantized)
+  {
+    _gemmlowp_output_stage.run();
+  }
+  else
+  {
+    if (_accumulate_biases)
+    {
+      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+    }
+  }
+}
+
+// Intentionally a no-op: the original one-shot preparation is disabled below and its
+// work is done lazily in run() instead (see _is_prepared handling above).
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](Tensor *w) {
+      if (!w->is_used())
+      {
+        w->allocator()->free();
+      }
+    };
+
+    // Pointer to current weights
+    const ITensor *cur_weights = _original_weights;
+
+    // Reshape of the weights (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Run reshape weights kernel and mark weights as unused
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_function.run();
+
+      cur_weights->mark_as_unused();
+      cur_weights = &_reshape_weights_output;
+      _are_weights_reshaped = true;
+    }
+
+    // Convert weights if needed (happens only once)
+    if (!_are_weights_converted)
+    {
+      _converted_weights_output.allocator()->allocate();
+      _convert_weights.run();
+
+      cur_weights->mark_as_unused();
+      _are_weights_converted = true;
+    }
+
+    // Release reshaped weights if unused
+    release_unused(&_reshape_weights_output);
+
+    // Prepare GEMM prepare and release unused weights
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+
+    // NOTE(review): _reshape_weights_output is released a second time here (it was
+    // already released above) — presumably a copy-paste slip; dead code under #if 0,
+    // so left untouched.
+    // Release converted weights if unused
+    release_unused(&_reshape_weights_output);
+    release_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+  }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..fcac3c7ae
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+// Sets up an optional input reshape followed by one of three fully-connected
+// implementations, selected by kernel_type and the input/weights data types:
+//  - GENERAL                -> NEFullyConnectedLayerEx
+//  - PREPROCESSED_WEIGHTS   -> NEFullyConnectedHybridLayer (F32 input + S8 weights)
+//                              or the stock NEFullyConnectedLayer otherwise.
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+                                               const arm_compute::ITensor *weights,
+                                               const arm_compute::ITensor *biases,
+                                               arm_compute::ITensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  const ITensor *input_to_use = input;
+  if (_needs_reshape)
+  {
+    // reshape
+    auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+    _neon_reshape.configure(_input, &_neon_buffer);
+    input_to_use = &_neon_buffer;
+  }
+
+  // Immediately-invoked lambda: pick and configure the concrete FC implementation,
+  // transferring ownership to the _neon_fc unique_ptr.
+  _neon_fc = [&]() {
+    if (kernel_type == KernelType::GENERAL)
+    {
+      auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return std::unique_ptr<arm_compute::IFunction>(fc);
+    }
+    else
+    {
+      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+      // "Hybrid" = float activations with 8-bit signed quantized weights.
+      bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+                       weights->info()->data_type() == DataType::S8;
+
+      if (is_hybrid)
+      {
+        auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+      else
+      {
+        auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+    }
+  }();
+
+  // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+  if (_needs_reshape)
+  {
+    _neon_buffer.allocator()->allocate();
+  }
+}
+
+// Runs the optional reshape and then the selected fully-connected function.
+void NEFullyConnectedReshapingLayer::run(void)
+{
+  if (_needs_reshape)
+    _neon_reshape.run();
+
+  _neon_fc->run();
+}
+
+// Delegates one-time preparation to the selected fully-connected function.
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..11794a1ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+// All state starts cleared; the real setup happens in configure().
+NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+      _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+      _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+      _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+      _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+{
+}
+
+// Configures the quantized GEMM pipeline: optional A-interleave/B-transpose reshapes,
+// the low-precision matrix multiply, row/column offset reductions, and either a fused
+// offset-contribution+output-stage kernel or a plain offset-contribution kernel.
+void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+                                               ITensor *output, const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+  ARM_COMPUTE_UNUSED(c);
+  ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+  const ITensor *matrix_a = a;
+  const ITensor *matrix_b = b;
+  GEMMInfo info = gemm_info;
+
+  // Clear state
+  _mtx_a_reshape_kernel = nullptr;
+  _mtx_b_reshape_kernel = nullptr;
+
+  // Set internal variables
+  _a_offset = a->info()->quantization_info().offset;
+  _b_offset = b->info()->quantization_info().offset;
+  _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+  _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  _is_prepared = false;
+  _fused_assembly_path = false;
+  _original_b = b;
+
+  const ITensor *a_to_use = a;
+
+  // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+  if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+  {
+    _fuse_output_stage = true;
+    _memory_group.manage(&_mm_result_s32);
+    TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+    _mm_result_s32.allocator()->init(info_mm_result_s32);
+  }
+
+#ifdef __aarch64__
+#if 0 // Can use after arm compute library v19.11
+  switch (a->info()->data_type())
+  {
+    case DataType::QASYMM8:
+    case DataType::QASYMM8_SIGNED:
+    case DataType::U8:
+    case DataType::S8:
+    {
+      if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+          info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+      {
+        _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+        _fused_assembly_path = _asm_glue.is_configured();
+      }
+      else
+      {
+        _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+                            gemm_info);
+      }
+      _assembly_path = _asm_glue.is_configured();
+      break;
+    }
+    default:
+    {
+      ARM_COMPUTE_ERROR("Datatype not supported");
+      break;
+    }
+  }
+#endif // 0
+  // NOTE(review): on aarch64 configure() unconditionally raises here — the assembly
+  // path above is disabled until ACL v19.11, so only non-aarch64 builds can use this
+  // function as written.
+  ARM_COMPUTE_ERROR("aarch64 not supported");
+#endif /* __aarch64__ */
+  if (!(_assembly_path || _run_vector_matrix_multiplication))
+  {
+    matrix_a = &_tmp_a;
+    matrix_b = &_tmp_b;
+
+    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+    // 4.0f) ]
+    TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+                      a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+    // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
+    // 16.0f) ]
+    TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+                      b->info()->quantization_info());
+    _tmp_a.allocator()->init(a_info);
+    _tmp_b.allocator()->init(b_info);
+    _memory_group.manage(&_tmp_a);
+    if (!_reshape_b_only_on_first_run)
+    {
+      _memory_group.manage(&_tmp_b);
+    }
+
+    // Configure interleave kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+      k->configure(a_to_use, &_tmp_a);
+      _mtx_a_reshape_kernel = std::move(k);
+    }
+
+    // Configure transpose kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+      k->configure(b, &_tmp_b);
+      _mtx_b_reshape_kernel = std::move(k);
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0)
+    {
+      TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
+      _vector_sum_col.allocator()->init(info_vector_sum_col);
+      if (!_reshape_b_only_on_first_run)
+      {
+        _memory_group.manage(&_vector_sum_col);
+      }
+
+      // Configure Matrix B reduction kernel
+      _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+      TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+
+      _vector_sum_row.allocator()->init(info_vector_sum_row);
+      _memory_group.manage(&_vector_sum_row);
+
+      // Configure matrix A reduction kernel
+      _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+                                        false);
+    }
+
+    if (_fuse_output_stage)
+    {
+      // Configure matrix multiply kernel
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, &_mm_result_s32);
+        _mm_kernel = std::move(k);
+      }
+
+      // Fused path: offset contribution + requantization in one kernel.
+      _offset_contribution_output_stage_kernel.configure(
+          &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+          _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+          _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+          _b_offset, info.gemmlowp_output_stage());
+    }
+    else
+    {
+      // Configure matrix multiply kernel
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, output);
+        _mm_kernel = std::move(k);
+      }
+      // Configure offset contribution kernel
+      _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                            _b_offset == 0 ? nullptr : &_vector_sum_row,
+                                            a_to_use->info()->dimension(0), _a_offset, _b_offset);
+    }
+  }
+
+  // Allocate tensors
+  if (!_assembly_path && !_run_vector_matrix_multiplication)
+  {
+    _tmp_a.allocator()->allocate();
+    if (!_reshape_b_only_on_first_run)
+    {
+      _tmp_b.allocator()->allocate();
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+      _vector_sum_col.allocator()->allocate();
+    }
+
+    if (_b_offset != 0)
+    {
+      _vector_sum_row.allocator()->allocate();
+    }
+  }
+
+  if (_fuse_output_stage)
+  {
+    _mm_result_s32.allocator()->allocate();
+  }
+}
+
+// Static validation mirroring configure(): checks data types (S8 inputs, S32 output),
+// shape compatibility, the optional assembly dispatch, and every sub-kernel used.
+Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+                                                const ITensorInfo *c, const ITensorInfo *output,
+                                                const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+      "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                  "The product AB is defined only if the number of columns in A is "
+                                  "equal to the number of rows in B");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+                                  "Matrix A already reshaped is not supported");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+                                  "Matrix B already reshaped is not supported");
+
+  GEMMInfo info = gemm_info;
+  const ITensorInfo *matrix_a_info = a;
+  const ITensorInfo *matrix_b_info = b;
+
+  const ITensorInfo *a_to_use = a;
+
+  TensorInfo tmp_a_info{};
+  TensorInfo tmp_b_info{};
+  TensorInfo mm_result_s32_info{};
+
+  int32_t a_offset = a->quantization_info().offset;
+  int32_t b_offset = b->quantization_info().offset;
+
+  bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+  if (fuse_output_stage)
+  {
+    auto_init_if_empty(
+        mm_result_s32_info,
+        a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+  }
+
+  // Check if we need to run the optimized assembly kernel
+  bool run_optimised = false;
+  bool run_optimised_requantized = false;
+  const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  if (a_to_use->data_type() == DataType::QASYMM8 &&
+      info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
+                                                          reshape_b_only_on_first_run));
+    run_optimised_requantized = run_optimised;
+  }
+  else
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+        a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
+        reshape_b_only_on_first_run));
+  }
+
+  if (run_optimised)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+    if (info.depth_output_gemm3d() != 0)
+    {
+      if (info.reinterpret_input_as_3d())
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+      }
+      else
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+      }
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+                                    "NEGEMM cannot reinterpret the input tensor as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+                                    "NEGEMM cannot reinterpret the output tensor as 3D");
+
+    const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+    if (!run_vector_matrix_multiplication)
+    {
+      matrix_a_info = &tmp_a_info;
+      matrix_b_info = &tmp_b_info;
+
+      // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+      // 4.0f) ]
+      TensorShape shape_tmp_a = a->tensor_shape();
+      shape_tmp_a.set(0, a->dimension(0) * 4);
+      shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+      // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
+      // / 16.0f) ]
+      TensorShape shape_tmp_b = b->tensor_shape();
+      shape_tmp_b.set(0, b->dimension(1) * 16);
+      shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+      // Validate interleave kernel
+      auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+      auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+    }
+  }
+
+  if (!run_optimised_requantized)
+  {
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if (a_offset != 0)
+    {
+      info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+      // Configure Matrix B reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+          b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (b_offset != 0)
+    {
+      info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+      // Configure matrix A reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+          a_to_use, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    if (fuse_output_stage)
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+            matrix_a_info, matrix_b_info, &mm_result_s32_info));
+      }
+
+      // Validate offset contribution kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+          &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+          info.gemmlowp_output_stage()));
+    }
+    else
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+      }
+      // Validate offset contribution kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+          output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+    }
+  }
+  return Status{};
+}
+
+// Executes the configured pipeline: optional reshapes, the GEMM (assembly or NEON
+// kernel), then the offset reductions and contribution/output-stage kernels.
+void NEGEMMLowpMatrixMultiplyCoreEx::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Reshape inputs
+  if (_mtx_a_reshape_kernel)
+  {
+    NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+  }
+  if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+  {
+    NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+  }
+
+  // Run GEMM
+  if (_asm_glue.is_configured())
+  {
+    _asm_glue.run();
+  }
+  else
+  {
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+  }
+
+  if (!_fused_assembly_path)
+  {
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+      NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    if (_fuse_output_stage)
+    {
+      // Run offset contribution kernel
+      NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+    }
+    else
+    {
+      // Run offset contribution kernel
+      NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+    }
+  }
+}
+
+// One-time preparation when B is constant (reshape_b_only_on_first_run): runs the
+// B reshape (assembly or kernel path), marks the original B unused, and precomputes
+// the B column sums needed by the offset contribution.
+void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+  if (!_is_prepared)
+  {
+    // Run assembly reshape
+    if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+    {
+      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+      _asm_glue.prepare();
+      _original_b->mark_as_unused();
+    }
+    // Run non-assembly reshape
+    else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+    {
+      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+      // Run reshape kernel and mark original weights tensor as unused
+      _tmp_b.allocator()->allocate();
+      NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+      _original_b->mark_as_unused();
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0 && _reshape_b_only_on_first_run)
+    {
+      _vector_sum_col.allocator()->allocate();
+      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    _is_prepared = true;
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..90dabb35a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+// Thin INESimpleFunction-style wrapper: instantiates NEGatherKernelEx and hands it
+// the gather configuration (select slices of `input` along `axis` using `indices`).
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+  k->configure(input, indices, output, axis);
+  _kernel = std::move(k);
+}
+
+// Forwards static validation to the underlying kernel.
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..624185d2c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+// Thin wrapper: instantiates NEHashtableLookupKernel which, for each entry of
+// `lookups`, copies the matching row of `input` (keyed by `keys`) into `output`
+// and records match success in `hits`.
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+                                  ITensor *output, ITensor *hits)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+  k->configure(lookups, keys, input, output, hits);
+  _kernel = std::move(k);
+}
+
+// Forwards static validation to the underlying kernel.
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                   const ITensorInfo *input, const ITensorInfo *output,
+                                   const ITensorInfo *hits)
+{
+  return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..1c2c8f027
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+      _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+// The normalization kernel works in NCHW; for NHWC inputs the tensor is permuted to
+// NCHW, normalized, and permuted back. NCHW inputs go straight to the kernel.
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+                                               ITensor *beta, float epsilon)
+{
+  const DataLayout data_layout = input->info()->data_layout();
+
+  // Configure Kernels
+  _is_nchw = data_layout == DataLayout::NCHW;
+
+  if (!_is_nchw)
+  {
+    _memory_group.manage(&_permuted_input);
+    _memory_group.manage(&_permuted_output);
+
+    // Configure the function to transform the input tensor from NHWC -> NCHW
+    _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+    _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+    _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+    _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+    // A null output means in-place operation: permute back into the input tensor.
+    _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
+                              PermutationVector(2U, 0U, 1U));
+    _permuted_input.allocator()->allocate();
+    _permuted_output.allocator()->allocate();
+  }
+  else
+  {
+    _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+  }
+}
+
+// Forwards validation to the kernel with both descriptors forced to NCHW layout.
+// NOTE(review): this dereferences `output` unconditionally, while configure()
+// tolerates output == nullptr (in-place) — confirm callers always pass a non-null
+// output info here.
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon)
+{
+  return NEInstanceNormalizationLayerKernelEx::validate(
+      &input->clone()->set_data_layout(DataLayout::NCHW),
+      &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+// Runs (optional NHWC->NCHW permute) -> normalization kernel -> (optional permute back).
+void NEInstanceNormalizationLayerEx::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Permute input
+  if (!_is_nchw)
+  {
+    _permute_input.run();
+  }
+
+  NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+  // Permute output
+  if (!_is_nchw)
+  {
+    _permute_output.run();
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
new file mode 100644
index 000000000..1150cef76
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPReLU.h" + +#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; + +void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp new file mode 100644 index 000000000..84411c266 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

namespace arm_compute
{
// Basic RNN cell: output = activation(FC(input) + hidden_state * recurrent_weights),
// with the result also copied back into hidden_state for the next step.
NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
      _gemm_output(), _add_output(), _is_prepared(false)
{
}

// Static shape/type validation for all participating tensors.
// @return Error status if any dimension relation or sub-function validation fails.
Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
                              const ITensorInfo *hidden_state, const ITensorInfo *output,
                              const ActivationLayerInfo &info)
{
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
                                      output);

  const int idx_width = 0;
  const int idx_height = 1;
  // Cross-check the classic RNN shape constraints: FC weights match the input width,
  // recurrent weights are square and match the FC output width.
  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
                              recurrent_weights->dimension(idx_width));
  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
                              recurrent_weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                     hidden_state->tensor_shape());

  // Shape of every intermediate tensor (FC out, GEMM out, add out).
  auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
                                   recurrent_weights, hidden_state->dimension(idx_height)),
                               1, input->data_type());

  ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
      &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
  ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));

  return Status{};
}

// Wire up the sub-functions/kernels and the intermediate buffers.
// Memory-group ordering is deliberate: each intermediate is manage()d before the
// consumer is configured and allocate()d only after its last consumer is configured.
// NOTE(review): `info` is taken by non-const reference here while validate() takes
// const& - signature is fixed by the header; confirm against the declaration.
void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
                             const ITensor *recurrent_weights, const ITensor *bias,
                             ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
  ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
                                                    recurrent_weights->info(), bias->info(),
                                                    hidden_state->info(), output->info(), info));

  const int idx_height = 1;
  TensorShape shape = misc::shape_calculator::compute_rnn_shape(
      recurrent_weights->info(), hidden_state->info()->dimension(idx_height));

  _is_prepared = false;

  // Manage intermediate buffers and configure
  _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
  _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));

  // Manage intermediate buffers and configure
  _memory_group.manage(&_fully_connected_out);
  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);

  _memory_group.manage(&_gemm_output);
  // hidden_state * recurrent_weights (alpha = 1, beta = 0, no bias tensor).
  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);

  _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
  _memory_group.manage(&_add_output);

  _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
                        ConvertPolicy::SATURATE);

  _fully_connected_out.allocator()->allocate();
  _gemm_output.allocator()->allocate();

  // Activation writes directly into hidden_state (the recurrent state update).
  _activation_kernel.configure(&_add_output, hidden_state, info);
  _add_output.allocator()->allocate();

  // Finally expose the updated hidden state through the output tensor.
  _copy_kernel.configure(hidden_state, output);
}

// Execute one RNN step: FC, recurrent GEMM, add, activation (into hidden_state),
// then copy hidden_state to output.
void NERNNLayerEx::run()
{
  prepare();

  MemoryGroupResourceScope scope_mg(_memory_group);

  _fully_connected_kernel.run();

  _gemm_state_f.run();

  NEScheduler::get().schedule(&_add_kernel, Window::DimY);
  NEScheduler::get().schedule(&_activation_kernel, Window::DimY);

  // copy hidden out to output
  NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
}

// One-time weight preparation for the FC and GEMM sub-functions; idempotent.
void NERNNLayerEx::prepare()
{
  if (!_is_prepared)
  {
    _fully_connected_kernel.prepare();
    _gemm_state_f.prepare();

    _is_prepared = true;
  }
}
} // namespace arm_compute
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); 
+ _reduction_kernels = + arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops); + _reduced_outs = + arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = i == 0 ? input->info()->tensor_shape() + : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(output->info()->data_layout())); + _memory_group.manage(_reduced_outs.get() + i); + _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], + ReductionOperation::MEAN_SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); + } +} + +void NEReduceMeanEx::run() +{ + _memory_group.acquire(); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } + _memory_group.release(); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..b36f8287a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Generic multi-axis reduction (op selectable at configure time), implemented as a
// chain of single-axis reductions plus an optional final reshape when
// keep_dims == false.
NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
      _reduction_ops(), _keep_dims()
{
}

// Static validation: data type, axis count/range, and that `output`'s shape matches
// the shape obtained by reducing (and optionally dropping) every requested axis.
// NOTE(review): `output` is dereferenced unconditionally - callers must pass a
// valid (possibly empty) tensor info.
Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
                                   bool keep_dims, const ITensorInfo *output, ReduceOperation op)
{
  ARM_COMPUTE_UNUSED(keep_dims);
  ARM_COMPUTE_UNUSED(op);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
                                                       DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());

  TensorShape out_shape = input->tensor_shape();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();
  const int input_dims = input->num_dimensions();
  Coordinates axis_local = reduction_axis;

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Ascending order keeps remove_dimension's shifted index (axis_local[i] - i) valid.
  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
                                input->num_dimensions() - 1);
    if (output->total_size() > 0 && keep_dims)
    {
      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
    }
    if (keep_dims)
    {
      out_shape.set(axis_local[i], 1);
    }
    else
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
  }
  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);

  return Status{};
}

// Build the reduction chain: axis i consumes the output of axis i-1. When
// keep_dims is set the final reduction writes into `output`; otherwise all steps
// go to memory-group-managed intermediates and a reshape drops the unit dims.
void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
                                  ITensor *output, ReduceOperation op)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input);

  _reduction_ops = reduction_axis.num_dimensions();
  _reduction_kernels.resize(_reduction_ops);
  // One intermediate per step, minus one when the last step targets `output`.
  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
  _keep_dims = keep_dims;

  Coordinates axis_local = reduction_axis;
  const int input_dims = input->info()->num_dimensions();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Perform reduction for every axis
  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    TensorShape out_shape =
        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
    out_shape.set(axis_local[i], 1);
    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);

    if (i == _reduction_ops - 1 && keep_dims)
    {
      _reduction_kernels[i].configure(in, output, axis_local[i], op);
    }
    else
    {
      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
                                                    input->info()->data_type(),
                                                    input->info()->quantization_info()));
      _memory_group.manage(&_reduced_outs[i]);
      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
    }
  }

  // Allocate intermediate tensors
  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
  {
    _reduced_outs[i].allocator()->allocate();
  }

  // Configure reshape layer if we want to drop the dimensions
  if (!keep_dims)
  {
    TensorShape out_shape = input->info()->tensor_shape();

    // We have to sort the reduction axis vectors in order for remove_dimension
    // to work properly
    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
    for (unsigned int i = 0; i < _reduction_ops; ++i)
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
  }
}

// Execute every reduction step in order, then the optional trailing reshape.
// MemoryGroupResourceScope acquires the intermediate buffers for the call (RAII).
void NEReduceOperation::run()
{
  MemoryGroupResourceScope scope_mg(_memory_group);

  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    _reduction_kernels[i].run();
  }

  if (!_keep_dims)
  {
    _reshape.run();
  }
}
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Sum reduction over an arbitrary set of axes: a chain of single-axis SUM
// reductions plus an optional final reshape when keep_dims == false.
NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
      _reduction_ops(), _keep_dims()
{
}

// Static validation: data type, axis count/range, and that `output`'s shape matches
// the shape obtained by reducing (and optionally dropping) every requested axis.
// NOTE(review): `output` is dereferenced unconditionally - callers must pass a
// valid (possibly empty) tensor info.
Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
                             bool keep_dims, const ITensorInfo *output)
{
  ARM_COMPUTE_UNUSED(keep_dims);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
                                                       DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());

  TensorShape out_shape = input->tensor_shape();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();
  const int input_dims = input->num_dimensions();
  Coordinates axis_local = reduction_axis;

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Ascending order keeps remove_dimension's shifted index (axis_local[i] - i) valid.
  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
                                input->num_dimensions() - 1);
    if (output->total_size() > 0 && keep_dims)
    {
      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
    }
    if (keep_dims)
    {
      out_shape.set(axis_local[i], 1);
    }
    else
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
  }
  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);

  return Status{};
}

// Build the reduction chain: axis i consumes the output of axis i-1. When
// keep_dims is set the final reduction writes into `output`; otherwise all steps
// go to memory-group-managed intermediates and a reshape drops the unit dims.
void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
                            ITensor *output)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input);

  _reduction_ops = reduction_axis.num_dimensions();
  _reduction_kernels.resize(_reduction_ops);
  // One intermediate per step, minus one when the last step targets `output`.
  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
  _keep_dims = keep_dims;

  Coordinates axis_local = reduction_axis;
  const int input_dims = input->info()->num_dimensions();
  const unsigned int reduction_ops = reduction_axis.num_dimensions();

  // Convert negative axis
  for (unsigned int i = 0; i < reduction_ops; ++i)
  {
    axis_local[i] = wrap_around(axis_local[i], input_dims);
  }

  // Perform reduction for every axis
  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    TensorShape out_shape =
        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
    out_shape.set(axis_local[i], 1);
    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);

    if (i == _reduction_ops - 1 && keep_dims)
    {
      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
    }
    else
    {
      // Intermediates inherit the input's layout here (NEReduceMeanEx uses the
      // output's layout instead - NOTE(review): confirm which is intended).
      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
                                                    input->info()->data_type(),
                                                    input->info()->quantization_info())
                                             .set_data_layout(input->info()->data_layout()));
      _memory_group.manage(&_reduced_outs[i]);
      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
                                      ReductionOperation::SUM);
    }
  }

  // Allocate intermediate tensors
  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
  {
    _reduced_outs[i].allocator()->allocate();
  }

  // Configure reshape layer if we want to drop the dimensions
  if (!keep_dims)
  {
    TensorShape out_shape = input->info()->tensor_shape();

    // We have to sort the reduction axis vectors in order for remove_dimension
    // to work properly
    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
    for (unsigned int i = 0; i < _reduction_ops; ++i)
    {
      out_shape.remove_dimension(axis_local[i] - i);
    }
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
  }
}

// Execute every reduction step in order, then the optional trailing reshape.
// MemoryGroupResourceScope acquires the intermediate buffers for the call (RAII).
void NEReduceSum::run()
{
  MemoryGroupResourceScope scope_mg(_memory_group);

  for (unsigned int i = 0; i < _reduction_ops; ++i)
  {
    _reduction_kernels[i].run();
  }

  if (!_keep_dims)
  {
    _reshape.run();
  }
}
#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

namespace arm_compute
{
namespace
{
/** Define dimension to split the window
 *
 * @param[in] axis Reduction axis
 *
 * @return The dimension to split the window
 */
size_t reduction_window_split_dimension(unsigned int axis)
{
  switch (axis)
  {
    case 0:
      return Window::DimY;
    case 1:
    case 2:
    case 3:
      return Window::DimX;
    default:
      ARM_COMPUTE_ERROR("Unsupported reduction axis");
  }
}
} // namespace

// Single-axis MIN/MAX reduction wrapper around NEReductionOperationKernelEx.
// For axis 0 the kernel requires a constant border, configured in configure().
NEReductionOperationEx::NEReductionOperationEx()
    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
{
}

// Static validation; forwards to the kernel's validate.
Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
                                        unsigned int axis, ReduceOperation op)
{
  ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));

  return Status{};
}

// Configure the reduction kernel and, for axis 0, a border-fill kernel whose fill
// value is the identity of the reduction: +max for MIN (255 for QASYMM8) and
// lowest for MAX (0 for QASYMM8), so border pixels can never win the reduction.
void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
                                       ReduceOperation op)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
  ARM_COMPUTE_ERROR_THROW_ON(
      NEReductionOperationEx::validate(input->info(), output->info(), axis, op));

  // Configure reduction kernel
  _reduction_kernel.configure(input, output, axis, op);
  _window_split = reduction_window_split_dimension(axis);
  _reduction_axis = axis;

  if (axis == 0)
  {
    // Configure fill border kernel
    const BorderSize fill_border_size = _reduction_kernel.border_size();
    PixelValue pixelValue;
    switch (op)
    {
      case ReduceOperation::MIN:
      {
        switch (input->info()->data_type())
        {
          case DataType::F32:
          {
            pixelValue = PixelValue(std::numeric_limits<float>::max());
            break;
          }
          case DataType::F16:
          {
            // 65504 is the largest finite value representable in IEEE fp16.
            pixelValue = PixelValue(static_cast<half>(65504.0f));
            break;
          }
          case DataType::QASYMM8:
          {
            pixelValue =
                PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
            break;
          }
          default:
          {
            ARM_COMPUTE_ERROR("Unsupported DataType");
          }
        }
        break;
      }
      case ReduceOperation::MAX:
      {
        switch (input->info()->data_type())
        {
          case DataType::F32:
          {
            pixelValue = PixelValue(-std::numeric_limits<float>::max());
            break;
          }
          case DataType::F16:
          {
            pixelValue = PixelValue(static_cast<half>(-65504.0f));
            break;
          }
          case DataType::QASYMM8:
          {
            pixelValue =
                PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
            break;
          }
          default:
          {
            ARM_COMPUTE_ERROR("Unsupported DataType");
          }
        }
        break;
      }
      default:
        ARM_COMPUTE_ERROR("Reduction Operation unsupported");
    }
    _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
  }
}

// Execute: fill the border first when reducing along axis 0, then run the
// reduction kernel, split along the dimension chosen at configure time.
void NEReductionOperationEx::run()
{
  if (_reduction_axis == 0)
  {
    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
  }
  NEScheduler::get().schedule(&_reduction_kernel, _window_split);
}
} // namespace arm_compute
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() + : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +{ +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, + const ITensor *paddings, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape, paddings, output); +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, + const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, + output); +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); + + return Status{}; +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, + const int block_shape_y, const 
Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( + input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + + return Status{}; +} + +void NESpaceToBatchLayerEx::run() +{ + // Zero out output only if we have paddings + if (_has_padding) + { + NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp new file mode 100644 index 000000000..b6ae21cc0 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp new file mode 100644 index 000000000..fd15ef05f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _permute_input(), + _permute_weights(), + _permute_output(), + _scaled_output(), + _weights_flipped(), + _permuted_input(), + _permuted_weights(), + _permuted_output(), + _is_nchw(false), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + 
input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else if (bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), + "Output's dim 0 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), + "Output's dim 1 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), + "Output's dim 2 is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + 
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const DataLayout data_layout = input->info()->data_layout(); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + _is_nchw = data_layout == DataLayout::NCHW; + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _memory_group.manage(&_scaled_output); + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_weights); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + // Configure the function to transform the weights tensor from NHWC -> NCHW + _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); + _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permuted_weights.info()->set_data_layout(DataLayout::NCHW); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), + _permuted_input.info()->quantization_info()); + scale_out_info.set_data_layout(DataLayout::NCHW); + _scaled_output.allocator()->init(scale_out_info); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::CEIL); + _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); + _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); + 
_flip_weights.configure(&_permuted_weights, &_weights_flipped); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const auto out_shape = output->info()->tensor_shape(); + TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; + TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), + output->info()->quantization_info()); + _permuted_output.allocator()->init(permuted_out_info); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); + + // Configure the function to transform the convoluted output to NHWC + _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + + _permuted_input.allocator()->allocate(); + _permuted_weights.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + _scaled_output.allocator()->init(scale_out_info); + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // setup the function to convolve 
the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + } + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + // MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + _upsample_f.run(); + _conv_f.run(); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + // Permute weights + if (!_is_nchw) + { + _permute_weights.run(); + } + NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 000000000..67e1bfb02 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4 && + input->data_layout() == DataLayout::NCHW); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp new file mode 100644 index 000000000..8025ae28e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h" + +namespace arm_compute +{ +namespace misc +{ + +namespace +{ + +bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() == 4 || output->num_dimensions() == 4) && + (input->num_dimensions() != output->num_dimensions() && + input->data_layout() == DataLayout::NCHW); +} + +} // namespace + +void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output) +{ + _input = input; + _output = output; + + arm_compute::PermutationVector pv; + if (input->info()->data_layout() == DataLayout::NCHW && input->info()->num_dimensions() == 4 && + output->info()->num_dimensions() != 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape W / H / C into another tensor of shape + // C / W / H + // + // Original | Permuted + // 0 | W | C (from 2) + // 1 | H | W (from 0) + // 2 | C | H (from 1) + // + pv = arm_compute::PermutationVector{2, 0, 1}; + } + else if (input->info()->data_layout() == DataLayout::NCHW && + input->info()->num_dimensions() != 4 && output->info()->num_dimensions() == 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape + // W / H / C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + const auto const_input = CAST_CL(const_cast<arm_compute::ITensor *>(input)); + if (shouldPermute(input->info(), output->info())) + { + _cl_permute.configure(const_input, &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate 
here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_reshape.configure(const_input, CAST_CL(output)); + } + } + else + { + if (shouldPermute(input->info(), output->info())) + { + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _neon_permuted.allocator()->allocate(); + } + else + { + _neon_reshape.configure(input, output); + } + } +} + +void GenericReshapeLayer::run(void) +{ + if (utils::isGpuMode()) + { + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + _cl_reshape.run(); + } + else + { + if (shouldPermute(_input->info(), _output->info())) + { + _neon_permute.run(); + } + _neon_reshape.run(); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp new file mode 100644 index 000000000..44a4bb9ed --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/misc/functions/Utils.h" + +namespace arm_compute +{ +namespace misc +{ +namespace utils +{ + +bool isGpuMode() +{ + char *neon = std::getenv("NEON"); + if (neon == nullptr) + return true; + else if (neon[0] == '1') + return false; + return true; +} + +} // namespace utils +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..f94effea1 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ +#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ + +typedef int32_t int32; + +namespace nnfw +{ +namespace rt +{ +namespace optimized_ops +{ +/** + * @brief class to define TopK operation + * @note The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file. + * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than + * TFLite. + * (TFLite additionaly supports kTfLiteInt64.) + * + * The class that collects top indexes of k values. 
Based on template + * tensorflow::gtl::TopN<> but, for optimization, + * it re-uses the same container. + */ +template <typename T> class TopContainer +{ +public: + /** + * @brief Prevent default constructor of of this class + */ + TopContainer() = delete; + /** + * @brief Constructor with params + * @param [in] row_size Size of row in data + * @param [in] k The top k predictions + */ + TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr) + { + container_.reserve(std::min(k, row_size) + 1); + } + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + */ + TopContainer(const TopContainer &) = delete; + /* + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + * @return Reference of TopContainer + */ + TopContainer &operator=(const TopContainer &) = delete; + + /** + * @brief Start collecting + * @param [in] values To set as values + * @return N/A + */ + void start_collecting(const T *values) + { + values_ = values; + container_.clear(); + } + + /** + * @brief Push a value to be compared for topk + * @param [in] a A value to compare + * @return N/A + */ + void push(int32 a) + { + auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)k_) + { + container_.push_back(a); + if (container_.size() == (size_t)(k_ + 1)) + { + std::make_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + else if (comparator(a, container_.front())) + { + container_.back() = a; + std::push_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + + /** + * @brief Get sorted result from pushed values + * @return Reference of vector with sorted values + */ + const std::vector<int32> &sorted_result() + { + 
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)(k_)) + { + std::sort(container_.begin(), container_.end(), comparator); + } + else + { + std::sort_heap(container_.begin(), container_.end() - 1, comparator); + container_.resize(k_); + } + return container_; + } + +private: + int32 k_; + std::vector<int32> container_; + const T *values_ = nullptr; + + bool compare_fun(int32 a, int32 b) const + { + if (values_[b] < values_[a]) + { + return true; + } + else if (values_[b] > values_[a]) + { + return false; + } + else + { + return a < b; + } + } +}; + +/** + * @brief Operates TopK operation with params + * @param [in] row_size Size of row in data + * @param [in] num_rows The number of rows in data + * @param [in] data To be operated in + * @param [in] k The top k predictions + * @param [out] output_indexes Indexes of targets in the top k predictions + * @param [out] output_values Values of targets in the top k predictions + * @return N/A + */ +template <typename T> +void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes, + T *output_values) +{ + TopContainer<T> topc(k, row_size); + for (int row = 0; row < num_rows; ++row) + { + const T *values_row = data + row * row_size; + topc.start_collecting(values_row); + for (int32 c = 0; c < row_size; ++c) + { + topc.push(c); + } + + // Prepare output buffers. + int32 *indexes_row = output_indexes + row * k; + T *output_row = output_values + row * k; + // We always assume that the output is sorted. + const auto &top_k = topc.sorted_result(); + std::copy(top_k.begin(), top_k.end(), indexes_row); + std::transform(top_k.begin(), top_k.end(), output_row, + [values_row](const int32 loc) { return values_row[loc]; }); + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ |