Diffstat (limited to 'libs/ARMComputeEx')
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h | 148
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h | 79
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h | 78
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h | 106
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h | 81
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h | 58
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h | 62
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h | 59
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h | 61
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h | 58
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h | 113
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h | 57
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h | 61
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h | 129
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h | 55
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h | 81
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h | 59
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h | 60
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h | 78
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h | 104
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h | 78
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h | 69
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h | 58
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h | 59
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h (renamed from libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h) | 92
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 478
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h | 113
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/TypesEx.h | 100
-rw-r--r--  libs/ARMComputeEx/arm_compute/core/UtilsEx.h | 37
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h | 63
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h | 114
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h | 62
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h | 45
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h | 41
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h | 27
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h | 42
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h | 44
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h | 54
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h | 38
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h | 37
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h | 59
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h | 39
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h | 77
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h | 40
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h | 47
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h | 51
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h | 34
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h | 81
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h | 87
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h | 73
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h | 56
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h | 44
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h | 40
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h | 69
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h | 58
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h | 60
-rw-r--r--  libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h | 83
-rw-r--r--  libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 360
-rw-r--r--  libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp | 123
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl | 89
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl | 94
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl | 74
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 30
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl | 70
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl | 84
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 56
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl | 86
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl | 93
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl | 69
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl | 84
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl | 57
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h | 565
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl | 88
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 38
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl | 48
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl | 86
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl | 72
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl | 45
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 74
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl | 88
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl | 60
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl | 152
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl | 69
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl | 163
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl | 69
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl | 75
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl | 104
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl | 63
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl | 26
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp | 211
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp | 159
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp | 216
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp | 117
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 173
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 17
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp | 212
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 109
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 114
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp | 77
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp | 21
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 177
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 89
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp | 166
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp | 185
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp | 149
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp | 126
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp | 54
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp | 129
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 181
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp | 198
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp | 238
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 113
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp | 170
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp (renamed from libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp) | 121
-rw-r--r--  libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp | 27
-rw-r--r--  libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp | 294
-rw-r--r--  libs/ARMComputeEx/src/core/UtilsEx.cpp | 34
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp | 35
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp | 120
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp | 46
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp | 1
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp | 40
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp | 29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp | 4
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp | 29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp | 50
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp | 39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp | 36
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp | 3
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp | 121
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp | 123
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp | 51
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp | 29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp | 28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp | 39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp | 307
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp | 30
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp | 5
-rw-r--r--  libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp | 74
-rw-r--r--  libs/ARMComputeEx/src/runtime/topk_v2.h | 68
153 files changed, 10265 insertions, 2864 deletions
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index 026487077..e4e752ef9 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLKernelLibraryEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
+ * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
+ */
+
#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
@@ -27,58 +35,76 @@
namespace arm_compute
{
-/** CLKernelLibrary class */
+/**
+ * @brief Class to build the OpenCL kernels added by nnfw
+ */
class CLKernelLibraryEx
{
using StringSet = std::set<std::string>;
private:
- /** Default Constructor. */
+ /**
+ * @brief Construct a new CLKernelLibraryEx object
+ */
CLKernelLibraryEx();
public:
- /** Prevent instances of this class from being copied */
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
- /** Prevent instances of this class from being copied */
+
+ /**
+ * @brief Prevent instances of this class from being copied.
+ */
const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
- /** Access the KernelLibrary singleton.
- * @return The KernelLibrary instance.
+
+ /**
+ * @brief Get the KernelLibrary singleton.
+ * @return The KernelLibrary instance
*/
static CLKernelLibraryEx &get();
- /** Initialises the kernel library.
- *
- * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
- * @param[in] context (Optional) CL context used to create programs.
- * @param[in] device (Optional) CL device for which the programs are created.
- */
- void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(),
- cl::Device device = cl::Device::getDefault())
+
+ /**
+ * @brief Initialise the kernel library.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @param[in] context CL context used to create programs.
+ * @param[in] device CL device for which the programs are created.
+ * @return N/A
+ */
+ void init(std::string kernel_path, cl::Context context, cl::Device device)
{
_kernel_path = std::move(kernel_path);
_context = std::move(context);
_device = std::move(device);
}
- /** Sets the path that the kernels reside in.
- *
- * @param[in] kernel_path Path of the kernel.
+
+ /**
+ * @brief Set the path that the kernels reside in.
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+ * @return N/A
*/
void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; };
- /** Gets the path that the kernels reside in.
+
+ /**
+ * @brief Get the path that the kernels reside in.
+ * @return the path of kernel files
*/
std::string get_kernel_path() { return _kernel_path; };
- /** Gets the source of the selected program.
- *
+
+ /**
+ * @brief Get the source of the selected program.
* @param[in] program_name Program name.
- *
* @return Source of the selected program.
*/
std::string get_program_source(const std::string &program_name);
- /** Sets the CL context used to create programs.
- *
+
+ /**
+ * @brief Set the CL context used to create programs.
* @note Setting the context also resets the device to the
* first one available in the new context.
- *
* @param[in] context A CL context.
+ * @return N/A
*/
void set_context(cl::Context context)
{
@@ -102,42 +128,56 @@ public:
}
}
- /** Accessor for the associated CL context.
- *
+ /**
+ * @brief Return associated CL context.
* @return A CL context.
*/
cl::Context &context() { return _context; }
- /** Sets the CL device for which the programs are created.
- *
+ /**
+ * @brief Set the CL device for which the programs are created.
* @param[in] device A CL device.
+ * @return N/A
*/
void set_device(cl::Device device) { _device = std::move(device); }
- /** Return the device version
- *
+ /**
+ * @brief Gets the CL device for which the programs are created.
+ * @return A CL device.
+ */
+ cl::Device &get_device() { return _device; }
+
+ /**
+ * @brief Return the device version
* @return The content of CL_DEVICE_VERSION
*/
std::string get_device_version();
- /** Creates a kernel from the kernel library.
- *
+
+ /**
+ * @brief Create a kernel from the kernel library.
* @param[in] kernel_name Kernel name.
* @param[in] build_options_set Kernel build options as a set.
- *
* @return The created kernel.
*/
Kernel create_kernel(const std::string &kernel_name,
const StringSet &build_options_set = {}) const;
- /** Find the maximum number of local work items in a workgroup can be supported for the kernel.
- *
+
+ /**
+ * @brief Find the maximum number of local work items in a workgroup that can be supported
+ * for the kernel.
+ * @param[in] kernel kernel object
+ * @return Maximum number of local work items supported for the kernel
+ */
size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
- /** Return the default NDRange for the device.
- *
+ /**
+ * @brief Return the default NDRange for the device.
+ * @return default NDRange of the device
*/
cl::NDRange default_ndrange() const;
- /** Clear the library's cache of binary programs
+ /**
+ * @brief Clear the library's cache of binary programs
+ * @return N/A
*/
void clear_programs_cache()
{
@@ -145,29 +185,45 @@ public:
_built_programs_map.clear();
}
- /** Access the cache of built OpenCL programs */
+ /**
+ * @brief Access the cache of built OpenCL programs
+ * @return program map whose key is the kernel name and whose value is the
+ * kernel source file name (*.cl)
+ */
const std::map<std::string, cl::Program> &get_built_programs() const
{
return _built_programs_map;
}
- /** Add a new built program to the cache
- *
+ /**
+ * @brief Add a new built program to the cache
* @param[in] built_program_name Name of the program
* @param[in] program Built program to add to the cache
+ * @return N/A
*/
void add_built_program(const std::string &built_program_name, cl::Program program);
+ /**
+ * @brief Returns true if FP16 is supported by the CL device
+ * @return true if the CL device supports FP16
+ */
+ bool fp16_supported() const;
+
+ /**
+ * @brief Returns true if int64_base_atomics extension is supported by the CL device
+ * @return true if the CL device supports int64_base_atomics extension
+ */
+ bool int64_base_atomics_supported() const;
+
private:
- /** Load program and its dependencies.
- *
+ /**
+ * @brief Load program and its dependencies.
* @param[in] program_name Name of the program to load.
*/
const Program &load_program(const std::string &program_name) const;
- /** Concatenates contents of a set into a single string.
- *
+ /**
+ * @brief Concatenates contents of a set into a single string.
* @param[in] s Input set to concatenate.
- *
* @return Concatenated string.
*/
std::string stringify_set(const StringSet &s) const;
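
For orientation, a minimal usage sketch of this interface. The kernel path and build options are illustrative, and "cast" is used only as an example of a kernel name registered by CLKernelLibrary.cpp:

    #include "arm_compute/core/CL/CLKernelLibraryEx.h"

    using namespace arm_compute;

    void init_extended_library()
    {
      // init() lost its default arguments in this change, so callers now pass
      // the kernel source path, CL context and CL device explicitly.
      CLKernelLibraryEx::get().init("./cl_kernels/", cl::Context::getDefault(),
                                    cl::Device::getDefault());

      // Build options are kernel-specific -D defines (illustrative here).
      Kernel kernel = CLKernelLibraryEx::get().create_kernel(
          "cast", {"-DDATA_TYPE_IN=uchar", "-DDATA_TYPE_OUT=float"});

      // The capability queries newly added in this change:
      const bool fp16 = CLKernelLibraryEx::get().fp16_supported();
      const bool atomics = CLKernelLibraryEx::get().int64_base_atomics_supported();
      (void)kernel; (void)fp16; (void)atomics;
    }
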
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h
new file mode 100644
index 000000000..dbda354d6
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/OpenCLEx.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OPENCLEX_H__
+#define __ARM_COMPUTE_OPENCLEX_H__
+
+#include <string>
+#include <utility>
+
+/* Configure the Khronos C++ wrapper to target OpenCL 1.2: */
+#ifndef ARM_COMPUTE_NO_EXCEPTIONS
+#define CL_HPP_ENABLE_EXCEPTIONS
+#endif // ARM_COMPUTE_NO_EXCEPTIONS
+#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
+#include <CL/cl2.hpp>
+
+namespace arm_compute
+{
+/** Class for loading OpenCL symbols. */
+class CLSymbolsEx final
+{
+private:
+ CLSymbolsEx() = default;
+ void load_symbols(void *handle);
+
+public:
+ /** Get the static instance of CLSymbols.
+ *
+ * @return The static instance of CLSymbols.
+ */
+ static CLSymbolsEx &get();
+ /** Load symbols from the given OpenCL library path.
+ *
+ * @param[in] library Path to the OpenCL library.
+ *
+ * @return True if loading the library is successful.
+ */
+ bool load(const std::string &library);
+ /** Load symbols from any of the default OpenCL library names.
+ *
+ * @return True if loading any library is successful.
+ */
+ bool load_default();
+
+#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr
+
+ DECLARE_FUNCTION_PTR(clGetEventInfo);
+ DECLARE_FUNCTION_PTR(clSetEventCallback);
+
+#undef DECLARE_FUNCTION_PTR
+
+private:
+ std::pair<bool, bool> _loaded{false, false};
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_OPENCLEX_H__ */
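
A sketch of how this loader is meant to be used: once load_default() succeeds, the two symbols declared above are called through their _ptr members.

    #include "arm_compute/core/CL/OpenCLEx.h"

    bool event_completed(cl_event event)
    {
      auto &symbols = arm_compute::CLSymbolsEx::get();
      // Resolves clGetEventInfo/clSetEventCallback from the system OpenCL
      // library; returns false if no library could be loaded.
      if (!symbols.load_default())
        return false;

      cl_int status = CL_QUEUED;
      symbols.clGetEventInfo_ptr(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
                                 sizeof(status), &status, nullptr);
      return status == CL_COMPLETE;
    }
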
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h
new file mode 100644
index 000000000..080cc47ef
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLActivationLayerExKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the activation layer kernel. */
+class CLActivationLayerExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLActivationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLActivationLayerExKernel(const CLActivationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLActivationLayerExKernel &operator=(const CLActivationLayerExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLActivationLayerExKernel(CLActivationLayerExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLActivationLayerExKernel &operator=(CLActivationLayerExKernel &&) = default;
+ /** Default destructor */
+ ~CLActivationLayerExKernel() = default;
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr, the activation function will be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will
+ * store the result
+ * of the activation function. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLActivationLayerExKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
+ * will store the result
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLACTIVATIONLAYEREXKERNEL_H__ */
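
A sketch of the in-place path that configure() documents, assuming ACL's CLScheduler has been initialised; construction of the ActivationLayerInfoEx argument is left to the caller since its interface lives in TypesEx.h:

    #include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void activate_in_place(CLTensor &tensor, const ActivationLayerInfoEx &act_info)
    {
      CLActivationLayerExKernel kernel;
      // A nullptr output selects the in-place path: `tensor` is overwritten
      // with the activation result.
      kernel.configure(&tensor, nullptr, act_info);
      CLScheduler::get().enqueue(kernel);
    }
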
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h
new file mode 100644
index 000000000..b91a26159
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgMinMaxKernel.h
+ * @brief This file defines CLArgMinMaxKernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__
+#define __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the argminmax kernel.
+ */
+class CLArgMinMaxKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor.
+ */
+ CLArgMinMaxKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied
+ */
+ CLArgMinMaxKernel(const CLArgMinMaxKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLArgMinMaxKernel to be copied
+ * @return Reference of this instance
+ */
+ CLArgMinMaxKernel &operator=(const CLArgMinMaxKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved
+ */
+ CLArgMinMaxKernel(CLArgMinMaxKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLArgMinMaxKernel to be moved
+ * @return Reference of this instance
+ */
+ CLArgMinMaxKernel &operator=(CLArgMinMaxKernel &&) = default;
+ /**
+ * @brief Initialise the kernel's input and output.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The output tensor, Data types supported: same as @p input.
+ * @param[in] argminmax_axis Axis along which to compute the arg min/max
+ * @param[in] op Arg operation to perform (min or max)
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t argminmax_axis,
+ ArgOperation op);
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxKernel
+ * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] output The output tensor info, Data types supported: same as @p input.
+ * @param[in] argminmax_axis Axis along which to compute the arg min/max
+ * @param[in] op Arg operation to perform (min or max)
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t argminmax_axis, ArgOperation op);
+
+ /**
+ * @brief Run CLArgMinMaxKernel op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ /**
+ * @brief Run CLArgMinMaxKernel op on CPU
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run_on_cpu(cl::CommandQueue &queue);
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _argminmax_axis;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARG_MIN_MAX_KERNEL_H__ */
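
The validate()/configure() split follows the usual ACL pattern: check the tensor infos first, then configure and enqueue. A sketch, assuming ArgOperation::MAX is one of the enumerators defined in TypesEx.h:

    #include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void argmax(CLTensor &input, CLTensor &output, uint32_t axis)
    {
      // Validate against the tensor infos before configuring the kernel.
      if (!bool(CLArgMinMaxKernel::validate(input.info(), output.info(), axis,
                                            ArgOperation::MAX)))
        return; // invalid configuration

      CLArgMinMaxKernel kernel;
      kernel.configure(&input, &output, axis, ArgOperation::MAX);
      CLScheduler::get().enqueue(kernel);
    }
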
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h
new file mode 100644
index 000000000..9a765f310
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the arithmetic subtraction kernel (supports broadcasting)
+ *
+ * Arithmetic subtraction is computed by:
+ * @f[ output(x,y) = input1(x,y) - input2(x,y) @f]
+ */
+class CLArithmeticSubtractionExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArithmeticSubtractionExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionExKernel(const CLArithmeticSubtractionExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionExKernel &operator=(const CLArithmeticSubtractionExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionExKernel(CLArithmeticSubtractionExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionExKernel &operator=(CLArithmeticSubtractionExKernel &&) = default;
+ /** Default destructor */
+ ~CLArithmeticSubtractionExKernel() = default;
+
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArithmeticSubtractionExKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEXKERNEL_H__ */
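
A configure/enqueue sketch for the formula above. ConvertPolicy::SATURATE only affects the integer data types; the same wiring applies to the other two-input kernels in this change (CLBinaryLogicalOpKernel, CLComparisonOpKernel, CLSquaredDifferenceKernel), with the policy argument replaced by the respective operation enum:

    #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void subtract(CLTensor &x, CLTensor &y, CLTensor &out)
    {
      CLArithmeticSubtractionExKernel kernel;
      // Broadcasting between mismatched input shapes is handled by the kernel.
      kernel.configure(&x, &y, &out, ConvertPolicy::SATURATE);
      CLScheduler::get().enqueue(kernel);
    }
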
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h
new file mode 100644
index 000000000..1387897c9
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform BATCH_TO_SPACE_ND operation */
+class CLBatchToSpaceNDKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBatchToSpaceNDKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceNDKernel(const CLBatchToSpaceNDKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchToSpaceNDKernel &operator=(const CLBatchToSpaceNDKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceNDKernel(CLBatchToSpaceNDKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBatchToSpaceNDKernel &operator=(CLBatchToSpaceNDKernel &&) = default;
+ /** Default destructor */
+ ~CLBatchToSpaceNDKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Pointer to the block size values, one per spatial dimension.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_KERNEL_H__ */
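
For reference, BATCH_TO_SPACE_ND moves batch entries into spatial blocks: with a 2-D block, N/(block[0]*block[1]) batches of width*block[0] x height*block[1] pixels come out. This shape relation is taken from the NN API definition of the operation, not from this header. A configure sketch:

    #include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void batch_to_space(CLTensor &input, CLTensor &output)
    {
      // E.g. 4 batches of 2x2xC with block {2, 2} -> 1 batch of 4x4xC;
      // `output` must already be initialised with the resulting shape.
      static const int32_t block_size[] = {2, 2};
      CLBatchToSpaceNDKernel kernel;
      kernel.configure(&input, &output, block_size);
      CLScheduler::get().enqueue(kernel);
    }
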
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
new file mode 100644
index 000000000..ab33d9d3a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to compute a binary logical operation between two input tensors */
+class CLBinaryLogicalOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLBinaryLogicalOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default;
+ /** Initialize the kernel's inputs and output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ * @param[in] op Binary logical operation to perform
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
index 6bd33bf8f..4c2feb903 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
@@ -14,6 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLCastKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLCastKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
#define __ARM_COMPUTE_CLCASTKERNEL_H__
@@ -23,30 +30,62 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel to perform a cast operation */
+/**
+ * @brief Class to define OpenCL kernel for cast operation
+ */
class CLCastKernel : public ICLKernel
{
public:
- /** Default constructor */
+ /**
+ * @brief Construct CLCastKernel object
+ */
CLCastKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLCastKernel(const CLCastKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLCastKernel &operator=(const CLCastKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct CLCastKernel object using default move constructor
+ * @param[in] CLCastKernel object to move
+ */
CLCastKernel(CLCastKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param[in] CLCastKernel object to move
+ */
CLCastKernel &operator=(CLCastKernel &&) = default;
- /** Default destructor */
+
+ /**
+ * @brief Destruct this CLCastKernel object
+ */
~CLCastKernel() = default;
- /** Initialise the kernel's input and output.
- *
+
+ /**
+ * @brief Initialise the kernel's input and output.
* @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
* @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @return N/A
*/
void configure(const ICLTensor *input, ICLTensor *output);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
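
A sketch of the typical configure-before-allocate flow for this kernel; the same pattern applies to the other one-input element-wise kernels in this change (e.g. CLExpKernel, CLNegKernel):

    #include "arm_compute/core/CL/kernels/CLCastKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void cast_u8_to_f32()
    {
      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));
      out.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

      CLCastKernel kernel;
      kernel.configure(&in, &out); // reads the tensor infos, so runs before allocate()

      in.allocator()->allocate();
      out.allocator()->allocate();
      CLScheduler::get().enqueue(kernel);
    }
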
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h
new file mode 100644
index 000000000..f5f455993
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLComparisonOpKernel.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__
+#define __ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform an element-wise comparison of two input tensors */
+class CLComparisonOpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLComparisonOpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLComparisonOpKernel(const CLComparisonOpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLComparisonOpKernel &operator=(const CLComparisonOpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLComparisonOpKernel(CLComparisonOpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLComparisonOpKernel &operator=(CLComparisonOpKernel &&) = default;
+ /** Initialize the kernel's inputs and output.
+ *
+ * @param[in] input1 Source tensor1.
+ * @param[in] input2 Source tensor2.
+ * @param[out] output Output tensor.
+ * @param[in] op Comparison operation to perform
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
new file mode 100644
index 000000000..60ec7a82a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform DepthToSpace operation */
+class CLDepthToSpaceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDepthToSpaceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
+ /** Default destructor */
+ ~CLDepthToSpaceKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size of the spatial rearrangement
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
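
For reference, DepthToSpace rearranges each group of block_size * block_size depth channels into a block_size x block_size spatial block, so a [W, H, C, N] input becomes [W*b, H*b, C/(b*b), N]. This shape relation follows the operation's definition (ACL's W-first shape ordering assumed). A configure sketch:

    #include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void depth_to_space(CLTensor &input, CLTensor &output)
    {
      CLDepthToSpaceKernel kernel;
      kernel.configure(&input, &output, 2 /* block_size */);
      CLScheduler::get().enqueue(kernel);
    }
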
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
new file mode 100644
index 000000000..da075db69
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLEmbeddingLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation with an OpenCL kernel
+ */
+class CLEmbeddingLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object
+ * */
+ CLEmbeddingLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * */
+ CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLEmbeddingLookupKernel object to move
+ * */
+ CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+ * */
+ ~CLEmbeddingLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+ * @param[in] input Source tensor.
+ * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] lookups 1D tensor of indices into the first dimension of @p input.
+ * Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLEmbeddingLookupKernel
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output The output tensor info, Data types supported: same as @p input.
+ * @param[in] lookups Lookups info. Data types supported: S32.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ const ICLTensor *_lookups; /**< Lookups tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */
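
A sketch of the lookup semantics: with table rows R0..R3 and lookups {2, 0, 2}, the output holds {R2, R0, R2}. Note the argument order of configure() (output precedes lookups):

    #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void embedding_lookup(CLTensor &table, CLTensor &lookups, CLTensor &output)
    {
      // `lookups` is a 1-D S32 tensor of row indices into `table`; `output`
      // must provide one row of `table`'s row shape per lookup value.
      if (!bool(CLEmbeddingLookupKernel::validate(table.info(), output.info(),
                                                  lookups.info())))
        return;

      CLEmbeddingLookupKernel kernel;
      kernel.configure(&table, &output, &lookups);
      CLScheduler::get().enqueue(kernel);
    }
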
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h
new file mode 100644
index 000000000..a6ea539f8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLExpKernel.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLEXPKERNEL_H__
+#define __ARM_COMPUTE_CLEXPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform an exponential operation */
+class CLExpKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLExpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpKernel(const CLExpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpKernel &operator=(const CLExpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLExpKernel(CLExpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLExpKernel &operator=(CLExpKernel &&) = default;
+ /** Default destructor */
+ ~CLExpKernel() = default;
+ /** Set the source and destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEXPKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
index a51441aca..7e35a80b0 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h
@@ -14,52 +14,85 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLGatherKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLGatherKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__
#define __ARM_COMPUTE_CLGATHERKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the gather kernel.
- *
+/**
+ * @brief Class to define an interface for the gather kernel.
*/
class CLGatherKernel : public ICLKernel
{
public:
- /** Default constructor.*/
+ /**
+ * @brief Construct CLGatherKernel object
+ * */
CLGatherKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLGatherKernel(const CLGatherKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLGatherKernel &operator=(const CLGatherKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct CLGatherKernel object by using default move constructor
+ * @param[in] CLGatherKernel object to move
+ */
CLGatherKernel(CLGatherKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLGatherKernel object to move
+ */
CLGatherKernel &operator=(CLGatherKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
+
+ /**
+ * @brief Initialise the kernel's inputs and output.
* @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
* @param[in] input2 An input tensor. Data types supported: S32.
* @param[out] output The output tensor, Data types supported: same as @p input1.
+ * @return N/A
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLGatherKernel
- *
* @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
* @param[in] input2 An input tensor. Data types supported: S32.
+ * @param[in] output The output tensor info, Data types supported: same as @p input1.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
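
A wiring sketch; as with CLEmbeddingLookupKernel, input2 is an S32 index tensor that selects elements of input1:

    #include "arm_compute/core/CL/kernels/CLGatherKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void gather(CLTensor &params, CLTensor &indices, CLTensor &output)
    {
      if (!bool(CLGatherKernel::validate(params.info(), indices.info(),
                                         output.info())))
        return;

      CLGatherKernel kernel;
      kernel.configure(&params, &indices, &output);
      CLScheduler::get().enqueue(kernel);
    }
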
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
new file mode 100644
index 000000000..c3fc15637
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookupKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLHashtableLookupKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+* @brief Class to perform HashtableLookup operation with an OpenCL kernel
+*/
+class CLHashtableLookupKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Construct a CLHashtableLookupKernel object
+   */
+ CLHashtableLookupKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+ CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+ CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete;
+
+ /**
+ * @brief Construct a CLHashtableLookupKernel object by using default move constructor
+ * @param[in] CLHashtableLookupKernel object to move
+   */
+ CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLHashtableLookupKernel object to move
+   */
+ CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+   */
+ ~CLHashtableLookupKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
+   * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+   * input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLHashtableLookupKernel
+ * @param[in] lookups The lookups tensor info. Data types supported: S32.
+ * @param[in] keys The keys tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input The input tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] output The output tensor info. Data types and data layouts supported: Same as @p
+   * input.
+   * @param[in] hits The hits tensor info. A boolean tensor that indicates whether the lookup
+   * hits (True) or not (False). Data types supported: U8/QASYMM8
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_lookups; /**< Lookups tensor */
+  const ICLTensor *_keys;    /**< Keys tensor */
+  const ICLTensor *_input;   /**< Source tensor */
+  ICLTensor *_output;        /**< Destination tensor */
+  ICLTensor *_hits;          /**< Hits tensor */
+  std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */
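
The five-tensor wiring is easiest to see in code. A sketch under the same assumptions as the previous example (standard CLTensor workflow, illustrative shapes): each row of input corresponds to one entry of keys, and hits carries one U8 flag per lookup:

    #include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void hashtable_lookup_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor lookups, keys, input, output, hits;
      lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
      keys.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));
      input.allocator()->init(TensorInfo(TensorShape(8U, 10U), 1, DataType::F32)); // one row per key
      output.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32)); // one row per lookup
      hits.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U8));        // 1 = key found

      CLHashtableLookupKernel kernel;
      kernel.configure(&lookups, &keys, &input, &output, &hits);
      // allocate the tensors and fill lookups/keys/input before enqueueing:
      CLScheduler::get().enqueue(kernel);
    }
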
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
new file mode 100644
index 000000000..ccbea147e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
+#define __ARM_COMPUTE_CLNEGKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a negation operation on a tensor */
+class CLNegKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLNegKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel(const CLNegKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLNegKernel &operator=(const CLNegKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLNegKernel(CLNegKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLNegKernel &operator=(CLNegKernel &&) = default;
+  /** Initialize the kernel's input and output.
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */
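
End to end, this kernel is driven like any other ICLKernel; a sketch with illustrative data, using the core library's blocking map/unmap to touch CL-backed memory from the host:

    #include "arm_compute/core/CL/kernels/CLNegKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void neg_sketch()
    {
      CLScheduler::get().default_init();

      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

      CLNegKernel neg;
      neg.configure(&in, &out); // derives the execution window from the tensor shapes

      in.allocator()->allocate();
      out.allocator()->allocate();

      in.map(); // blocking map, then fill with sample values
      auto *ptr = reinterpret_cast<float *>(in.buffer());
      for(int i = 0; i < 32; ++i)
      {
        ptr[i] = static_cast<float>(i);
      }
      in.unmap();

      CLScheduler::get().enqueue(neg);
      CLScheduler::get().sync(); // out now holds the negated values
    }
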
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h
new file mode 100644
index 000000000..181a6226a
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerExKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerExKernel(const CLNormalizationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerExKernel &operator=(const CLNormalizationLayerExKernel &) = delete;
+ /** Default Move Constructor. */
+ CLNormalizationLayerExKernel(CLNormalizationLayerExKernel &&) = default;
+ /** Default move assignment operator */
+ CLNormalizationLayerExKernel &operator=(CLNormalizationLayerExKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data types supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLNormalizationLayerExKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32.
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data types supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ BorderSize _border_size;
+ bool _is_in_map;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYEREXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
new file mode 100644
index 000000000..eff1b8bd5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
+#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to calculate PReLU */
+class CLPReLUKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPReLUKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel(const CLPReLUKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel(CLPReLUKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
+  /** Initialize the kernel's input and output.
+   *
+   * @param[in] input Source tensor.
+   * @param[in] alpha Alpha (slope) tensor.
+ * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_alpha;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h
new file mode 100644
index 000000000..cbaa2adee
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform PAD operation */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel(CLPadLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in] pad_size Padding size tensor. Data types supported: S32
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ ICLTensor *_pad_size; /**< Padding Size tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */
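
Since pad_size is itself a tensor, the caller pre-computes the padded output shape. A sketch assuming the Android NN PAD convention of one (before, after) S32 pair per dimension; the exact layout this kernel expects is not spelled out in the header, so treat it as an assumption:

    #include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void pad_sketch()
    {
      CLTensor in, out, pad_size;
      in.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
      // Assumed layout: pad_size holds a (before, after) pair per input dimension.
      pad_size.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::S32));
      // Output sized for one element of padding on each side of width and height.
      out.allocator()->init(TensorInfo(TensorShape(6U, 6U, 3U), 1, DataType::F32));

      CLPadLayerKernel kernel;
      kernel.configure(&in, &out, &pad_size);
    }
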
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h
new file mode 100644
index 000000000..3434deee8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPermuteExKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__
+#define __ARM_COMPUTE_CLPERMUTEEXKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform tensor permutation.
+ *
+ * Permutes a tensor according to a given permutation vector
+ */
+class CLPermuteExKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPermuteExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermuteExKernel(const CLPermuteExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermuteExKernel &operator=(const CLPermuteExKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPermuteExKernel(CLPermuteExKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPermuteExKernel &operator=(CLPermuteExKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to permute. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+   * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPermuteKernel
+ *
+ * @param[in] input First tensor input info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ PermutationVector _perm;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPERMUTEEXKERNEL_H__ */
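
The permutation vector is worth a concrete example. This sketch assumes the core library's convention that output dimension i takes input dimension perm[i], which is what makes (2, 0, 1) the usual NCHW-to-NHWC reordering:

    #include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void permute_sketch()
    {
      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
      // Output shape is (in[2], in[0], in[1]) under the assumed convention.
      out.allocator()->init(TensorInfo(TensorShape(2U, 8U, 4U), 1, DataType::F32));

      const PermutationVector perm(2U, 0U, 1U);

      if(bool(CLPermuteExKernel::validate(in.info(), out.info(), perm)))
      {
        CLPermuteExKernel kernel;
        kernel.configure(&in, &out, perm);
      }
    }
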
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
index cd2b255bc..d579f5d8f 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h
@@ -14,68 +14,106 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLPixelWiseDivisionKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLPixelWiseDivisionKernel class
+ */
+
#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the pixelwise division kernel.
- *
+/**
+ * @brief Interface for the pixelwise division kernel.
*/
class CLPixelWiseDivisionKernel : public ICLKernel
{
public:
- /** Default constructor.*/
+ /**
+ * @brief Construct a CLPixelWiseDivisionKernel object
+ */
CLPixelWiseDivisionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ */
CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Construct a CLPixelWiseDivisionKernel object by using move constructor
+ * @param[in] CLPixelWiseDivisionKernel object to move
+ */
CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param[in] CLPixelWiseDivisionKernel object to move
+ */
CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+
+ /**
+ * @brief Initialise the kernel's input, output and border mode.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor. Data types supported: same as @p input1.
* @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after division.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
* even.
+ * @return N/A
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivisionKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after division.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
+
+ /**
+ * @brief The size of the border for that kernel
+ * @return The width in number of elements of the border.
+ */
BorderSize border_size() const override;
private:
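
The scale and policy parameters deserve an example, since scale is restricted to 1/255 or 1/2^n with 0 <= n <= 15. A sketch with F32 tensors; ConvertPolicy and RoundingPolicy are the core library's enums:

    #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void division_sketch()
    {
      CLTensor x, y, out;
      x.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
      y.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

      const float scale = 1.0f; // 1/2^0; 0.5f, 0.25f, ... and 1.0f / 255 are also legal

      CLPixelWiseDivisionKernel kernel;
      kernel.configure(&x, &y, &out, scale, ConvertPolicy::SATURATE,
                       RoundingPolicy::TO_NEAREST_EVEN);
    }
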
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
deleted file mode 100644
index a7d96cc5c..000000000
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the pixelwise division kernel.
- *
- */
-class CLReduceMaxKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLReduceMaxKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReduceMaxKernel(const CLReduceMaxKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReduceMaxKernel(CLReduceMaxKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note:
- * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
- */
- void configure(const ICLTensor *input, int32_t axis, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReduceMaxKernel
- *
- * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] axis Axis to reduce
- * @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- void run_on_cpu(cl::CommandQueue &queue);
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- int32_t _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
new file mode 100644
index 000000000..a26a4a7fc
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperationKernel.h
+ * @brief This file defines CLReduceOperationKernel class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define interface for the reduce operation kernel
+ */
+class CLReduceOperationKernel : public ICLKernel
+{
+public:
+ /**
+ * @brief Default constructor
+ */
+ CLReduceOperationKernel();
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Allow instances of this class to be moved
+ */
+ CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default;
+ /**
+ * @brief Default destructor
+ */
+ ~CLReduceOperationKernel() = default;
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperationKernel.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op);
+
+  /**
+   * @brief Run CLReduceOperationKernel op
+   * @param[in] window Window to be used for in_slice
+   * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */
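
A configure sketch for the reduce kernel. ReduceOperation comes from TypesEx.h; MAX is assumed here as a representative enumerator (check TypesEx.h for the exact list). As the header states, the output keeps the input's rank with the reduced axis collapsed to 1:

    #include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
    #include "arm_compute/core/TypesEx.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void reduce_sketch()
    {
      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(10U, 5U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(10U, 1U), 1, DataType::F32)); // axis 1 reduced

      CLReduceOperationKernel kernel;
      kernel.configure(&in, &out, 1U, ReduceOperation::MAX); // enumerator name assumed
    }
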
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
deleted file mode 100644
index de9df3381..000000000
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel */
-class CLReductionMeanKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReductionMeanKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionMeanKernel(const CLReductionMeanKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReductionMeanKernel(CLReductionMeanKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default;
- /** Default destructor */
- ~CLReductionMeanKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
- */
- void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReductionMeanKernel.
- *
- * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
- * input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- std::vector<uint32_t> _reduction_axis;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644
index 000000000..68534f1ab
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToBatchNDKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToBatchNDKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Block size tensor. Data types supported: S32.
+ * @param[in] padding_size Padding size tensor. Data types supported: S32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ const ICLTensor *_block_size; /**< Block size tensor */
+ const ICLTensor *_padding_size; /**< Padding size tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
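
A shape-level sketch of the 4D contract described above: a 2x2 block on a 4x4 spatial extent multiplies the batch by 4 and halves width and height. NCHW layout is used here, and the block/padding tensor layouts are assumptions of this sketch:

    #include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void space_to_batch_sketch()
    {
      CLTensor in, block_size, padding_size, out;
      in.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
      // Assumed: [block_w, block_h].
      block_size.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
      // Assumed: (before, after) per spatial dimension.
      padding_size.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::S32));
      out.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

      CLSpaceToBatchNDKernel kernel;
      kernel.configure(&in, &block_size, &padding_size, &out);
    }
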
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 000000000..be845a549
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_DEPTH operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSpaceToDepthKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+ /** Default destructor */
+ ~CLSpaceToDepthKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in] block_size Block size by which spatial data is rearranged into depth.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
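
Unlike the batch variant above, block_size here is a plain scalar. A sketch in which block size 2 turns a 4x4x1 tensor into 2x2x4 (shapes illustrative):

    #include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void space_to_depth_sketch()
    {
      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U), 1, DataType::F32));

      CLSpaceToDepthKernel kernel;
      kernel.configure(&in, &out, 2);
    }
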
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h
new file mode 100644
index 000000000..a4c44e35d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__
+#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to return the squared difference of two tensors, (x - y)^2 */
+class CLSquaredDifferenceKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLSquaredDifferenceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSquaredDifferenceKernel(const CLSquaredDifferenceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSquaredDifferenceKernel &operator=(const CLSquaredDifferenceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSquaredDifferenceKernel(CLSquaredDifferenceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSquaredDifferenceKernel &operator=(CLSquaredDifferenceKernel &&) = default;
+  /** Initialize the kernel's input and output.
+   *
+   * @param[in] input1 First source tensor.
+   * @param[in] input2 Second source tensor.
+ * @param[out] output Output tensor.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h
index 248ae6635..6368c380e 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceExKernel.h
@@ -14,36 +14,64 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__
+
+/**
+ * @file CLStridedSliceExKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLStridedSliceExKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
namespace arm_compute
{
class ICLTensor;
-/** Interface for the kernel to extract a strided slice of a tensor */
-class CLStridedSliceKernel : public ICLKernel
+/**
+* @brief Class to define an interface for the kernel to extract a strided slice of a tensor
+*/
+class CLStridedSliceExKernel : public ICLKernel
{
public:
- /** Default constructor */
- CLStridedSliceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
- /** Default destructor */
- ~CLStridedSliceKernel() = default;
- /** Set the input and output of the kernel
- *
+ /**
+ * @brief Construct a CLStridedSliceExKernel object
+   */
+ CLStridedSliceExKernel();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+ CLStridedSliceExKernel(const CLStridedSliceExKernel &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+ CLStridedSliceExKernel &operator=(const CLStridedSliceExKernel &) = delete;
+
+ /**
+ * @brief Construct a CLStridedSliceExKernel object by using default move constructor
+ * @param[in] CLStridedSliceExKernel object to move
+   */
+ CLStridedSliceExKernel(CLStridedSliceExKernel &&) = default;
+
+ /**
+ * @brief Move assignment operator
+ * @param[in] CLStridedSliceExKernel object to move
+   */
+ CLStridedSliceExKernel &operator=(CLStridedSliceExKernel &&) = default;
+
+ /**
+ * @brief Destruct this object
+   */
+ ~CLStridedSliceExKernel() = default;
+
+ /**
+ * @brief Set the input and output of the kernel
* @param[in] input Source tensor. Data type supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] beginData The begin tensor. Data types supported: S32.
* The number of dimensions must be 1.
@@ -57,17 +85,17 @@ public:
* @param[in] beginMask Mask for begin
* @param[in] endMask Mask for end
* @param[in] shrinkAxisMask Mask for shrink axis.
- *
+ * @return N/A
*/
void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
int32_t shrinkAxisMask);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLStridedSliceKernel
- *
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLStridedSliceExKernel
* @param[in] input The input tensor info. Data types supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
* @param[in] output The output tensor info, Data types supported: same as @p input1.
* @param[in] begin The begin tensor info. Data types supported: S32.
* The number of dimensions must be 1.
@@ -81,7 +109,6 @@ public:
* @param[in] beginMask Mask for begin
* @param[in] endMask Mask for end
* @param[in] shrinkAxisMask Mask for shrink axis.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
@@ -89,7 +116,16 @@ public:
const ITensorInfo *stride, int32_t beginMask, int32_t endMask,
int32_t shrinkAxisMask);
- // Inherited methods overridden:
+ /**
+ * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
+ * queue.
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have
+ * been executed by the time this method returns.
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of
+ * the window returned by window()).
+   * @param[in,out] queue Command queue on which to enqueue the kernel.
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -103,4 +139,4 @@ private:
int32_t _shrinkAxisMask; /** Shrink axis mask */
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEXKERNEL_H__ */
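
The three masks are per-dimension bitfields. A sketch assuming the TensorFlow StridedSlice convention the parameter names suggest (bit i of beginMask/endMask ignores the begin/end value for dimension i; bit i of shrinkAxisMask drops dimension i from the output):

    #include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void strided_slice_sketch()
    {
      CLTensor in, out, begin, end, strides;
      in.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(3U, 8U), 1, DataType::F32));
      // begin/end/strides are 1-D S32 tensors with one entry per input dimension.
      begin.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
      end.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
      strides.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));

      const int32_t beginMask = 0;
      const int32_t endMask = 1 << 1; // run dimension 1 to its full extent
      const int32_t shrinkAxisMask = 0;

      CLStridedSliceExKernel kernel;
      kernel.configure(&in, &out, &begin, &end, &strides, beginMask, endMask, shrinkAxisMask);
    }
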
diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
index 5c567f38e..eb2bad254 100644
--- a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -14,14 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLTopKV2Kernel.h
+ * @brief This file defines classes for TopKV2Kernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/ICLKernel.h"
-#include <array>
-
// these parameters can be changed
#define _ITEMS 16 // number of items in a group
#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS
@@ -33,24 +37,59 @@ namespace arm_compute
{
class ICLTensor;
+/**
+ * @brief Class to define CLTopKV2Single
+ */
class CLTopKV2Single : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Single();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ */
CLTopKV2Single(const CLTopKV2Single &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Single &operator=(const CLTopKV2Single &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ */
CLTopKV2Single(CLTopKV2Single &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Single &operator=(CLTopKV2Single &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+   * @param[out] topk_values Values of the top k predictions
+   * @param[out] topk_indices Indices of the top k predictions
+   * @param[in] indices Buffer of indices
+   * @param[in] temp_stack Temporary stack buffer used while sorting
+   * @param[in] k K of the top k predictions
+   * @param[in] n Number of elements to sort
+   * @return N/A
+ */
void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n);
- // Inherited methods overridden:
+  /**
+ * @brief Run CLTopKV2Single op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -59,52 +98,121 @@ private:
ICLTensor *_topk_indices;
};
+/**
+ * @brief Class to define CLTopKV2Init
+ */
class CLTopKV2Init : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Init();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ */
CLTopKV2Init(const CLTopKV2Init &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Init &operator=(const CLTopKV2Init &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ */
CLTopKV2Init(CLTopKV2Init &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Init &operator=(CLTopKV2Init &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[in] input An input tensor
+   * @param[in] in_key_buf Buffer of input keys
+   * @param[in] in_ind_buf Buffer of input indices
+   * @param[in] n Number of elements to sort
+   * @return N/A
+ */
void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n);
- // Inherited methods overridden:
+  /**
+ * @brief Run CLTopKV2Init op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
ICLTensor *_input;
};
+/**
+ * @brief Class to define CLRadixSortHistogram
+ */
class CLRadixSortHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ */
CLRadixSortHistogram(const CLRadixSortHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ */
CLRadixSortHistogram(CLRadixSortHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+   * @param[in] n Number of integers to sort
+   * @return N/A
+ */
void configure(cl::Buffer *hist_buf, int bits, int n);
+ /**
+ * @brief Set pass
+   * @param[in] pass Current pass of the radix sort algorithm
+   * @param[in] in_key_buf Buffer of input keys
+   * @return N/A
+ */
void setPass(int pass, cl::Buffer *in_key_buf)
{
_pass = pass;
_in_key_buf = in_key_buf;
}
- // Inherited methods overridden:
+  /**
+ * @brief Run CLRadixSortHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -112,82 +220,210 @@ private:
cl::Buffer *_in_key_buf;
};
+/**
+ * @brief Class to define CLRadixSortScanHistogram
+ */
class CLRadixSortScanHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortScanHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ */
CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ */
CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * @return N/A
+ */
void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
- // Inherited methods overridden:
+ /**
+ * @brief Run CLRadixSortScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortGlobalScanHistogram
+ */
class CLRadixSortGlobalScanHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortGlobalScanHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ */
CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ */
CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[out] temp_buf Temporary buffer used during the global scan of the histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @return N/A
+ */
void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits);
- // Inherited methods overridden:
+ /**
+ * @brief Run CLRadixSortGlobalScanHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortPasteHistogram
+ */
class CLRadixSortPasteHistogram : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortPasteHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ */
CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied
+ * @return Reference of this instance
+ */
CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ */
CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved
+ * @return Reference of this instance
+ */
CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[out] glob_sum_buf Buffer of global sum
+ * @param[in] bits Number of bits to be used for radix sort
+ * @return N/A
+ */
void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits);
- // Inherited methods overridden:
+ /**
+ * @brief Run CLRadixSortPasteHistogram op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
};
+/**
+ * @brief Class to define CLRadixSortReorder
+ */
class CLRadixSortReorder : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLRadixSortReorder();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ */
CLRadixSortReorder(const CLRadixSortReorder &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied
+ * @return Reference of this instance
+ */
CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ */
CLRadixSortReorder(CLRadixSortReorder &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved
+ * @return Reference of this instance
+ */
CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] hist_buf Buffer of histogram
+ * @param[in] bits Number of bits to be used for radix sort
+ * @param[in] n Number of integers to sort
+ * @return N/A
+ */
void configure(cl::Buffer *hist_buf, int bits, int n);
+ /**
+ * @brief Set pass
+ * @param[in] pass Index of the current pass in the radix sort algorithm
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+ * @return N/A
+ */
void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
cl::Buffer *out_ind_buf)
{
@@ -197,7 +433,12 @@ public:
_in_ind_buf = in_ind_buf;
_out_ind_buf = out_ind_buf;
}
- // Inherited methods overridden:
+ /**
+ * @brief Run CLRadixSortReorder op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -208,47 +449,115 @@ private:
cl::Buffer *_out_ind_buf;
};
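For illustration, the five kernels above together perform one pass of a radix sort over the keys: build a per-workgroup histogram, scan it locally and globally, paste the scanned sums back, then reorder keys and indices. A minimal sketch of the driving loop, assuming 32-bit keys, pre-created cl::Buffer objects and the usual CLScheduler enqueue path; the radix width, buffer names and pass order are assumptions, not taken from this patch:

#include <utility>
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;

void radix_sort_sketch(cl::Buffer *in_key, cl::Buffer *out_key, cl::Buffer *in_ind,
                       cl::Buffer *out_ind, cl::Buffer *hist, cl::Buffer *glob_sum,
                       cl::Buffer *temp, int n)
{
  const int bits = 4; // radix width per pass (hypothetical)
  CLRadixSortHistogram histogram;
  CLRadixSortScanHistogram scan;
  CLRadixSortGlobalScanHistogram glob_scan;
  CLRadixSortPasteHistogram paste;
  CLRadixSortReorder reorder;
  histogram.configure(hist, bits, n);
  scan.configure(hist, glob_sum, bits);
  glob_scan.configure(glob_sum, temp, bits);
  paste.configure(hist, glob_sum, bits);
  reorder.configure(hist, bits, n);
  for(int pass = 0; pass < 32 / bits; ++pass)
  {
    histogram.setPass(pass, in_key);                         // bind this pass's input
    reorder.setPass(pass, in_key, out_key, in_ind, out_ind); // and its ping-pong buffers
    CLScheduler::get().enqueue(histogram, false);
    CLScheduler::get().enqueue(scan, false);
    CLScheduler::get().enqueue(glob_scan, false);
    CLScheduler::get().enqueue(paste, false);
    CLScheduler::get().enqueue(reorder, false);
    std::swap(in_key, out_key); // output of this pass feeds the next one
    std::swap(in_ind, out_ind);
  }
  CLScheduler::get().sync();
}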
+/**
+ * @brief Class to define CLTopKV2FindFirstNegative
+ */
class CLTopKV2FindFirstNegative : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2FindFirstNegative();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ */
CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied
+ * @return Reference of this instance
+ */
CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ */
CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved
+ * @return Reference of this instance
+ */
CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number of elements to search
+ * @return N/A
+ */
void configure(cl::Buffer *first_negative_idx_buf, int n);
+ /**
+ * @brief Set output buffer
+ * @param[out] out_key_buf Buffer of output key
+ * @return N/A
+ */
void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; }
- // Inherited methods overridden:
+ /**
+ * @brief Run CLTopKV2FindFirstNegative op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
cl::Buffer *_out_key_buf;
};
+/**
+ * @brief Class to define CLTopKV2ReorderNegatives
+ */
class CLTopKV2ReorderNegatives : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2ReorderNegatives();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ */
CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied
+ * @return Reference of this instance
+ */
CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ */
CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved
+ * @return Reference of this instance
+ */
CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] first_negative_idx_buf Buffer of the first negative index
+ * @param[in] n Number of elements to process
+ * @return N/A
+ */
void configure(cl::Buffer *first_negative_idx_buf, int n);
+ /**
+ * @brief Set buffers
+ * @param[in] in_key_buf Buffer of input key
+ * @param[out] out_key_buf Buffer of output key
+ * @param[in] in_ind_buf Buffer of input index
+ * @param[out] out_ind_buf Buffer of output index
+ * @return N/A
+ */
void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf,
cl::Buffer *out_ind_buf)
{
@@ -258,7 +567,12 @@ public:
_out_ind_buf = out_ind_buf;
}
- // Inherited methods overridden:
+ /**
+ * @brief Run CLTopKV2ReorderNegatives op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
@@ -268,25 +582,63 @@ private:
cl::Buffer *_out_ind_buf;
};
+/**
+ * @brief Class to define CLTopKV2Store
+ */
class CLTopKV2Store : public ICLKernel
{
public:
- /** Constructor */
+ /**
+ * @brief Constructor
+ */
CLTopKV2Store();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ */
CLTopKV2Store(const CLTopKV2Store &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied
+ * @return Reference of this instance
+ */
CLTopKV2Store &operator=(const CLTopKV2Store &) = delete;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ */
CLTopKV2Store(CLTopKV2Store &&) = default;
- /** Allow instances of this class to be moved */
+ /**
+ * @brief Allow instances of this class to be moved
+ * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved
+ * @return Reference of this instance
+ */
CLTopKV2Store &operator=(CLTopKV2Store &&) = default;
+ /**
+ * @brief Initialise kernel with params
+ * @param[out] values Values tensor to store
+ * @param[out] indices Indices tensor to store the top k indices in
+ * @param[in] k K of the top k predictions
+ * @param[in] n Number of elements to process
+ * @return N/A
+ */
void configure(ICLTensor *values, ICLTensor *indices, int k, int n);
+ /**
+ * @brief Set buffers
+ * @param[out] out_key_buf Buffer of output key
+ * @param[out] out_ind_buf Buffer of output index
+ * @return N/A
+ */
void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf);
- // Inherited methods overridden:
+ /**
+ * @brief Run CLTopKV2Store op
+ * @param[in] window Window to be used for in_slice
+ * @param[in] queue cl::CommandQueue
+ * @return N/A
+ */
void run(const Window &window, cl::CommandQueue &queue) override;
private:
diff --git a/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h
new file mode 100644
index 000000000..f7bf72985
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerExKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NENormalizationLayerExKernel"; }
+ /** Default constructor */
+ NENormalizationLayerExKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerExKernel(const NENormalizationLayerExKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerExKernel &operator=(const NENormalizationLayerExKernel &) = delete;
+ /** Default Move Constructor. */
+ NENormalizationLayerExKernel(NENormalizationLayerExKernel &&) = default;
+ /** Default move assignment operator */
+ NENormalizationLayerExKernel &operator=(NENormalizationLayerExKernel &&) = default;
+ /** Default destructor */
+ ~NENormalizationLayerExKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: FP16/F32.
+ * @param[in] input_squared Source tensor with each element squared. 3 lower dims represent a
+ * single input with dimensions [width, height, IFM],
+ * Data type supported: same as @p input
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ITensor *input, const ITensor *input_squared, ITensor *output,
+ NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NENormalizationLayerExKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: FP16/F32.
+ * @param[in] input_squared Source tensor with each element squared. 3 lower dims represent a
+ * single input with dimensions [width, height, IFM],
+ * Data type supported: same as @p input
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as
+ * input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared,
+ const ITensorInfo *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform normalization depending on the given template
+ * dimension. The second template parameter specifies whether the
+ * normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <DataType dt, unsigned int dim, bool do_2D_norm>
+ void normalize_float(const Window &window);
+
+ /** Common signature for all the specialised normalization functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using NormalizationFunctionEx = void (NENormalizationLayerExKernel::*)(const Window &window);
+
+private:
+ NormalizationFunctionEx _func;
+ const ITensor *_input;
+ const ITensor *_input_squared;
+ ITensor *_output;
+ NormalizationLayerInfo _norm_info;
+ BorderSize _border_size;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYEREXKERNEL_H__ */
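For illustration, the usual validate-before-configure pattern for this kernel; the 3D F32 shapes are hypothetical and input_squared is assumed to hold the element-wise square of the input, computed beforehand:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
using namespace arm_compute;

bool norm_config_is_valid()
{
  const TensorInfo in(TensorShape(16U, 16U, 8U), 1, DataType::F32);
  const TensorInfo sq(TensorShape(16U, 16U, 8U), 1, DataType::F32); // squared input
  const TensorInfo out(TensorShape(16U, 16U, 8U), 1, DataType::F32);
  const NormalizationLayerInfo info(NormType::CROSS_MAP, 5);
  return NENormalizationLayerExKernel::validate(&in, &sq, &out, info).error_code() ==
         ErrorCode::OK;
}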
diff --git a/libs/ARMComputeEx/arm_compute/core/TypesEx.h b/libs/ARMComputeEx/arm_compute/core/TypesEx.h
new file mode 100644
index 000000000..8381f1cc6
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/TypesEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_TYPESEX_H__
+#define __ARM_COMPUTE_TYPESEX_H__
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/** Available arg operations */
+enum class ArgOperation
+{
+ MAX,
+ MIN,
+};
+
+/** Available reduce operations */
+enum class ReduceOperation
+{
+ MAX, /**< Max */
+ MEAN, /**< Mean */
+ SUM, /**< Sum */
+ MIN, /**< Min */
+};
+
+/** Available binary logical operations */
+enum class BinaryLogicalOperation
+{
+ AND, /**< AND */
+ OR, /**< OR */
+};
+
+/** Available comparison operations */
+enum class ComparisonOperation
+{
+ EQUAL, /**< EQUAL */
+ NOT_EQUAL, /**< NOT_EQUAL */
+};
+
+/** Activation Layer Information class */
+class ActivationLayerInfoEx
+{
+public:
+ /** Available activation functions */
+ enum class ActivationFunction
+ {
+    RSQRT /**< Inverse square root ( \f$ f(x) = 1 / \sqrt{x} \f$ ) */
+ };
+
+ ActivationLayerInfoEx() = default;
+ /** Constructor
+ *
+ * @param[in] f The activation function to use.
+ * @param[in] a (Optional) The alpha parameter used by some activation functions.
+ * @param[in] b (Optional) The beta parameter used by some activation functions.
+ */
+ ActivationLayerInfoEx(ActivationFunction f, float a = 0.0f, float b = 0.0f)
+ : _act(f), _a(a), _b(b), _enabled(true)
+ {
+ }
+ /** Get the type of activation function */
+ ActivationFunction activation() const { return _act; }
+ /** Get the alpha value */
+ float a() const { return _a; }
+ /** Get the beta value */
+ float b() const { return _b; }
+ /** Check if initialised */
+ bool enabled() const { return _enabled; }
+
+private:
+ ActivationFunction _act = {ActivationLayerInfoEx::ActivationFunction::RSQRT};
+ float _a = {};
+ float _b = {};
+ bool _enabled = {false};
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TYPESEX_H__ */
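For illustration, a small sketch of the extended activation info defined above; the values are hypothetical:

#include "arm_compute/core/TypesEx.h"
using namespace arm_compute;

void activation_info_example()
{
  // Only RSQRT is available; alpha and beta default to 0.
  const ActivationLayerInfoEx info(ActivationLayerInfoEx::ActivationFunction::RSQRT);
  // info.activation() == ActivationFunction::RSQRT and info.enabled() == true here,
  // while a default-constructed ActivationLayerInfoEx reports enabled() == false.
  (void)info;
}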
diff --git a/libs/ARMComputeEx/arm_compute/core/UtilsEx.h b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h
new file mode 100644
index 000000000..8dd68a0c3
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_UTILSEX_H__
+#define __ARM_COMPUTE_UTILSEX_H__
+
+#include "arm_compute/core/TypesEx.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+/** Translates a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act);
+}
+#endif /*__ARM_COMPUTE_UTILSEX_H__ */
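For illustration, a one-line usage sketch of the helper declared above; the printed text is expected to name the function (e.g. "RSQRT"), but the exact string is defined by the implementation:

#include "arm_compute/core/UtilsEx.h"
#include <iostream>

void print_activation_name()
{
  std::cout << arm_compute::string_from_activation_func_ex(
                   arm_compute::ActivationLayerInfoEx::ActivationFunction::RSQRT)
            << std::endl;
}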
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
new file mode 100644
index 000000000..7e578550f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLActivationLayerEx.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLActivationLayerExKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class CLActivationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr or is equal to the input, the activation function will
+ * be performed in-place
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will
+ * store the result
+ * of the activation function. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfoEx act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLActivationLayerEx
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
+ * will store the result
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLACTIVATIONLAYEREX_H__ */
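For illustration, a minimal end-to-end sketch of the function above; shapes are hypothetical and error handling is elided:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h"
using namespace arm_compute;

void rsqrt_example()
{
  CLScheduler::get().default_init();
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
  CLActivationLayerEx rsqrt;
  rsqrt.configure(&input, &output,
                  ActivationLayerInfoEx(ActivationLayerInfoEx::ActivationFunction::RSQRT));
  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... map input and fill it with data ...
  rsqrt.run();
  CLScheduler::get().sync();
}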
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
new file mode 100644
index 000000000..8044c58af
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMax.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgMinMax.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLArgMinMax class
+ */
+
+#ifndef __ARM_COMPUTE_CLARG_MIN_MAX_H__
+#define __ARM_COMPUTE_CLARG_MIN_MAX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute CLArgMinMax operation
+ */
+class CLArgMinMax : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLArgMinMax object
+ */
+ CLArgMinMax();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax(const CLArgMinMax &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLArgMinMax &operator=(const CLArgMinMax &) = delete;
+
+ /**
+ * @brief Construct a new CLArgMinMax object by using the move constructor
+ * @param[in] CLArgMinMax object to move
+ */
+ CLArgMinMax(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Assign a CLArgMinMax object.
+ * @param[in] CLArgMinMax object to assign. This object will be moved.
+ */
+ CLArgMinMax &operator=(CLArgMinMax &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[out] output The result of the argminmax operation. Data types supported: same as @p
+ * input.
+ * @param[in] argminmax_axis Axes along which to perform the argminmax operation. Must be
+ * sorted and contain no duplicates.
+ * @param[in] op ArgOperation::MIN for ArgMin, ArgOperation::MAX for ArgMax.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> argminmax_axis,
+ ArgOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] argminmax_axis Axes along which to perform the argminmax operation
+ * @param[in] output The result of the argminmax operation. Data types supported: same as @p
+ * input.
+ * @param[in] op ArgOperation::MIN or ArgOperation::MAX
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op);
+
+ /**
+ * @brief Run the kernels contained in the function
+ * This operation runs on the CPU or the GPU depending on the value of the
+ * argminmax_MAX_RUN_ON_CPU macro in CLArgMinMax.cpp:
+ * if argminmax_MAX_RUN_ON_CPU == 1 the operation runs on the CPU,
+ * otherwise it runs on the GPU.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::vector<uint32_t> _argminmax_axis;
+ ArgOperation _arg_op;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLArgMinMaxKernel[]> _argminmax_kernels{nullptr};
+ size_t _num_of_kernels;
+};
+}
+#endif /* __ARM_COMPUTE_CLARG_MIN_MAX_H__ */
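For illustration, a usage sketch of CLArgMinMax; the axis and tensors are hypothetical, and allocation plus scheduler initialisation are elided:

#include <cstdint>
#include <vector>
#include "arm_compute/runtime/CL/functions/CLArgMinMax.h"
using namespace arm_compute;

void argmax_example(ICLTensor *input, ICLTensor *output)
{
  CLArgMinMax argminmax;
  // Reduce along axis 0 and keep the index of the maximum value.
  argminmax.configure(input, output, std::vector<uint32_t>{0}, ArgOperation::MAX);
  argminmax.run();
}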
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
new file mode 100644
index 000000000..34e6c6334
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLArithmeticSubtractionExKernel
+ *
+ * @note The tensor data type for the inputs must be U8/S16/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class CLArithmeticSubtractionEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified
+ * inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArithmeticSubtractionEx
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8),
+ * S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONEX_H__ */
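For illustration, a usage sketch with saturating overflow handling; tensor names are hypothetical:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h"
using namespace arm_compute;

void subtract_example(ICLTensor *a, ICLTensor *b, ICLTensor *diff)
{
  CLArithmeticSubtractionEx sub;
  // SATURATE clamps results that would overflow the output type (e.g. U8 or S16).
  sub.configure(a, b, diff, ConvertPolicy::SATURATE);
  sub.run();
}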
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
new file mode 100644
index 000000000..d16a0762d
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchToSpaceNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges elements of the batch dimension into spatial blocks of the output.
+ */
+class CLBatchToSpaceND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size A pointer to an array of integer values specifying block sizes
+ * for spatial dimension.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
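For illustration, a usage sketch; block_size points at one integer per spatial dimension, and the values here are hypothetical:

#include <cstdint>
#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"
using namespace arm_compute;

void batch_to_space_example(ICLTensor *input, ICLTensor *output)
{
  static const int32_t block_size[] = {2, 2}; // height and width block sizes
  CLBatchToSpaceND b2s;
  b2s.configure(input, output, block_size);
  b2s.run();
}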
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..061e34f26
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported: U8/QASYMM8.
+ * @param[in] input2 Source tensor2. Data types supported: U8/QASYMM8.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8.
+ * @param[in] op Binary logical operation to perform
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
index 63050067d..56b8408e2 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
@@ -14,30 +14,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLCast.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCast class
+ */
+
#ifndef __ARM_COMPUTE_CLCAST_H__
#define __ARM_COMPUTE_CLCAST_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLCastKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the tensor of the output tensor's type.
+/**
+ * @brief Class to run @ref CLCastKernel.
+ * This converts the input tensor to the data type of the output tensor.
*/
class CLCast : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * The input tensor is [in, out] because its TensorInfo might be modified
- * inside the kernel.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
*/
void configure(ICLTensor *input, ICLTensor *output);
};
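For illustration, a usage sketch of the reworked CLCast, converting U8 data to F32; shapes are hypothetical:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"
using namespace arm_compute;

void cast_example()
{
  CLTensor in, out;
  in.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::U8));
  out.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
  CLCast cast;
  cast.configure(&in, &out); // the output tensor's type drives the conversion
  in.allocator()->allocate();
  out.allocator()->allocate();
  cast.run();
}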
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
new file mode 100644
index 000000000..1b0d70e7f
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLComparisonOp.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLCOMPARISON_OP_H__
+#define __ARM_COMPUTE_CLCOMPARISON_OP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLComparisonOp : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input1 Source tensor1. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input2 Source tensor2. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] op Comparison operation to perform
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLCOMPARISON_OP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
new file mode 100644
index 000000000..d78a6ada4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthToSpaceKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function rearranges depth (channel) data of the input into spatial blocks of the output.
+ */
+class CLDepthToSpace : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size; integer only
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
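For illustration, a usage sketch; with block size 2 the channel data is expected to be redistributed into 2x2 spatial blocks (the exact layout is defined by CLDepthToSpaceKernel), and the tensors here are hypothetical:

#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
using namespace arm_compute;

void depth_to_space_example(ICLTensor *input, ICLTensor *output)
{
  CLDepthToSpace d2s;
  d2s.configure(input, output, 2); // block size 2
  d2s.run();
}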
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups 1D lookups tensor whose values are indices into the first dimension of
+ * @p input.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
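For illustration, a usage sketch; lookups is a 1D index tensor selecting slices of input, and the names are hypothetical:

#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
using namespace arm_compute;

void embedding_lookup_example(const ICLTensor *table, ICLTensor *rows, const ICLTensor *indices)
{
  CLEmbeddingLookup lookup;
  // Each value in indices selects one slice along the first dimension of table.
  lookup.configure(table, rows, indices);
  lookup.run();
}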
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
new file mode 100644
index 000000000..2d0fc23a4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLExp.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLEXP_H__
+#define __ARM_COMPUTE_CLEXP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLExpKernel */
+class CLExp : public ICLSimpleFunction
+{
+public:
+ /** Set the source, destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLEXP_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
index 3ae7afe14..f7fd3cda1 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h
@@ -14,32 +14,43 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLGather.h
+ * @brief This file contains CLGather class
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __ARM_COMPUTE_CLGATHER_H__
#define __ARM_COMPUTE_CLGATHER_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLGatherKernel. */
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
class CLGather : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- */
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input1.
+ * @return N/A
+ */
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGather
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
- * @param[in] input2 An indexes tensor. Data types supported: S32.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGather
+ * @param[in] input1 An input tensor. Data types supported: U8/S32/F32.
+ * @param[in] input2 An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input1.
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..65aa6cbd5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups 1D lookups tensor whose values are indices into the first dimension of
+ * @p input.
+ * @param[in] keys 1D keys tensor. @p keys and @p input together represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+ ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
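For illustration, a usage sketch with hypothetical names; each entry of lookups is matched against keys, the matching row of input is copied to output, and hits records whether a match was found:

#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
using namespace arm_compute;

void hashtable_lookup_example(const ICLTensor *lookups, const ICLTensor *keys,
                              const ICLTensor *values, ICLTensor *output, ICLTensor *hits)
{
  CLHashtableLookup htl;
  htl.configure(lookups, keys, values, output, hits);
  htl.run();
}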
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
new file mode 100644
index 000000000..198a0fd4e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
new file mode 100644
index 000000000..4077245d5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute a normalization layer. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLNormalizationLayerExKernel
+ *
+ */
+class CLNormalizationLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types
+ * supported: F16/F32 (Written to by the border handler)
+ * @param[out] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLNormalizationLayerEx
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported:
+ * F16/F32
+ * @param[in] output Destination tensor. Dimensions, data type and number of channels must
+ * match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLNormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel to run */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYEREX_H__ */
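A hedged configure/validate sketch for the class above (shapes and normalization parameters are illustrative only):

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h"

void normalization_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor src, dst;
    const TensorShape shape(14U, 14U, 64U); // [width, height, IFM]
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    const NormalizationLayerInfo info(NormType::CROSS_MAP, 5U);
    // validate() lets the caller reject a bad configuration before any allocation
    ARM_COMPUTE_ERROR_THROW_ON(CLNormalizationLayerEx::validate(src.info(), dst.info(), info));

    CLNormalizationLayerEx norm;
    norm.configure(&src, &dst, info);
    src.allocator()->allocate();
    dst.allocator()->allocate();
    norm.run();
}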
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
new file mode 100644
index 000000000..622a61b5e
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_H__
+#define __ARM_COMPUTE_CLPRELU_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLPReLU : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+   * @param[in]  input  Source tensor. Data types supported:
+   *                    QASYMM8/F16/F32.
+   * @param[in]  alpha  Alpha tensor. Data types supported:
+   *                    QASYMM8/F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_H__*/
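A short sketch of the call sequence (the broadcastable per-channel alpha shape is an assumption, not something this header guarantees):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPReLU.h"

void prelu_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, alpha, output;
    const TensorShape shape(8U, 8U, 32U);
    input.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    // One slope per channel (assumed broadcast layout)
    alpha.allocator()->init(TensorInfo(TensorShape(1U, 1U, 32U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    CLPReLU prelu;
    prelu.configure(&input, &alpha, &output); // output = x >= 0 ? x : alpha * x

    for (auto *t : {&input, &alpha, &output})
        t->allocator()->allocate();
    prelu.run();
}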
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..d6ea486d1
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYEREX_H__
+#define __ARM_COMPUTE_CLPADLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPadLayerKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the output tensor's data type.
+ */
+class CLPadLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  pad_size Tensor of padding values in NHWC format, with shape [n, 2],
+   *                      where n is the rank of the input tensor. Data types supported: S32
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYEREX_H__ */
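To make the pad_size contract concrete, a sketch for a rank-4 input padded by one element on each spatial border (the exact [n, 2] storage order of the {before, after} pairs is an assumption):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

void pad_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, output, pad_size;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
    // One element of padding before and after each spatial dimension: 8 -> 10
    output.allocator()->init(TensorInfo(TensorShape(10U, 10U, 3U, 1U), 1, DataType::F32));
    // Shape [n, 2] with n == 4 (input rank); each row is a {before, after} pair (S32)
    pad_size.allocator()->init(TensorInfo(TensorShape(2U, 4U), 1, DataType::S32));

    CLPadLayerEx pad;
    pad.configure(&input, &output, &pad_size);
    // allocate the tensors, fill pad_size on the host, then pad.run()
}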
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
new file mode 100644
index 000000000..9a0cc213c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPermuteEx.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPERMUTEEX_H__
+#define __ARM_COMPUTE_CLPERMUTEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute an @ref CLPermuteKernel. */
+class CLPermuteEx : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input The input tensor to permute. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+   * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
+ *
+ * @param[in] input First tensor input info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPERMUTEEX_H__ */
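A sketch of a 3D permutation; the perm convention shown (output dimension i takes input dimension perm[i], so perm = {2, 0, 1} gives an NCHW-to-NHWC style reorder) follows the stock CLPermute and is an assumption here:

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPermuteEx.h"

void permute_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(2U, 3U, 4U), 1, DataType::F32));
    const PermutationVector perm(2U, 0U, 1U);
    dst.allocator()->init(TensorInfo(TensorShape(4U, 2U, 3U), 1, DataType::F32));

    ARM_COMPUTE_ERROR_THROW_ON(CLPermuteEx::validate(src.info(), dst.info(), perm));

    CLPermuteEx permute;
    permute.configure(&src, &dst, perm);
    // allocate and run as in the sketches above
}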
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
index c1383e21f..b142d3a2e 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -14,53 +14,61 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLPixelWiseDivision.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLPixelWiseDivision class
+ */
#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLPixelWiseDivisionKernel. */
+/**
+ * @brief Class to run @ref CLPixelWiseDivisionKernel.
+ */
class CLPixelWiseDivision : public ICLSimpleFunction
{
public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be
* modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output The output tensor, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or
- * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
* even.
+ * @return N/A
*/
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
- /** Static function to check if given info will lead to a valid configuration of @ref
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
* CLPixelWiseDivision
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1.
- * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- *
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
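A configure sketch highlighting the scale constraint (values are illustrative):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"

void division_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor a, b, out;
    const TensorShape shape(32U);
    for (auto *t : {&a, &b, &out})
        t->allocator()->init(TensorInfo(shape, 1, DataType::F32));

    CLPixelWiseDivision div;
    // scale must be 1/255 or 1/2^n with 0 <= n <= 15; 0.25f == 1/2^2 is legal
    div.configure(&a, &b, &out, 0.25f, ConvertPolicy::SATURATE,
                  RoundingPolicy::TO_NEAREST_EVEN);
    // allocate and run as usual
}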
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
deleted file mode 100644
index 14b473f33..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__
-#define __ARM_COMPUTE_CLREDUCE_MAX_H__
-
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
- */
-class CLReduceMax : public IFunction
-{
-public:
- /** Constructor */
- CLReduceMax();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax(const CLReduceMax &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReduceMax &operator=(const CLReduceMax &) = delete;
- /** Allow instances of this class to be moved */
- CLReduceMax(CLReduceMax &&) = default;
- /** Allow instances of this class to be moved */
- CLReduceMax &operator=(CLReduceMax &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32.
- */
- void configure(ICLTensor *input, int32_t axis, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLPixelWiseDivision
- *
- * @param[in] input Input image. Data types supported: F32
- * @param[in] axis Axis to reduce. Data type supported: S32
- * @param[out] output indices related to top k values. Data types supported: F32. *
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- void run_on_cpu();
-
- int32_t _axis;
-
- ICLTensor *_input;
- ICLTensor *_output;
-
- std::unique_ptr<ICLKernel> _kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..e1a6f6ab4
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation();
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+   * @param[in]  axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+   * @param[in]  axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
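A sketch reducing the width and height of a [4, 4, 8] tensor (the ReduceOperation::MEAN enumerator is assumed from TypesEx.h, which this patch does not show):

#include <cstdint>
#include <set>
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

void reduce_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U), 1, DataType::F32));
    // Axes {0, 1} are sorted and duplicate-free, as configure() requires
    output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 8U), 1, DataType::F32));

    CLReduceOperation reduce;
    reduce.configure(&input, &output, std::set<uint32_t>{0U, 1U}, ReduceOperation::MEAN);
    // allocate and run as usual
}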
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
deleted file mode 100644
index 2081518c1..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Perform reduction operation.
- */
-class CLReductionMean : public IFunction
-{
-public:
- /** Default Constructor.
- */
- CLReductionMean();
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- */
- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLReductionMean.
- *
- * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
- * input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLReductionMeanKernel _reduction_mean_kernel;
- CLFillBorderKernel _fill_border_kernel;
-};
-}
-#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644
index 000000000..7e2df8986
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides the "spatial" dimensions of the input into a grid of blocks of
+ * shape block_shape, and interleaves these blocks with the "batch" dimension, so that in the
+ * output the batch dimension combines the position within a spatial block with the original
+ * batch position.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Tensor of integer values specifying block sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+ ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
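A worked shape example for the 4D contract above (zero padding; the 1D/2D layouts of block_size and padding_size are assumptions):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"

void space_to_batch_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, block_size, padding_size, output;
    // [W=4, H=4, C=1, N=1] with a 2x2 block and no padding becomes
    // [W=2, H=2, C=1, N=4]: the batch grows by block_w * block_h.
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
    block_size.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
    padding_size.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

    CLSpaceToBatchND s2b;
    s2b.configure(&input, &block_size, &padding_size, &output);
    // allocate, fill block_size = {2, 2} and padding_size = zeros, then s2b.run()
}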
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 000000000..17f762092
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the output tensor's data type.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size. Only integer values are supported.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
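A shape sketch for the block-size contract (shapes illustrative):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"

void space_to_depth_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, output;
    // block_size 2: each spatial dimension shrinks by 2 and the channel
    // count grows by 2 * 2 = 4.
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U), 1, DataType::F32));

    CLSpaceToDepth s2d;
    s2d.configure(&input, &output, 2);
    // allocate and run as usual
}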
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
new file mode 100644
index 000000000..3610ba71c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSquaredDifference.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+#define __ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLSquaredDifference : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+   * @param[in]  input1 First source tensor. Data types supported:
+   *                    U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in]  input2 Second source tensor. Data types supported:
+   *                    U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSQUARED_DIFFERENCE_H__*/
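A minimal sketch of the element-wise call above (shapes illustrative):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h"

void squared_difference_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor x, y, out;
    const TensorShape shape(16U);
    for (auto *t : {&x, &y, &out})
        t->allocator()->init(TensorInfo(shape, 1, DataType::F32));

    CLSquaredDifference sqdiff;
    sqdiff.configure(&x, &y, &out); // out[i] = (x[i] - y[i])^2
    // allocate and run as usual
}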
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
deleted file mode 100644
index f223a79be..000000000
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] input First tensor input. Data type supported:
- * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-};
-
-class CLStridedSliceCPU : public IFunction
-{
-public:
- /** Initialise inputs and outputs
- *
- * @param[in] input First tensor input.
- * @param[out] output Output tensor.
- */
- void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-
- void run() override;
-
-private:
- void run_on_cpu();
-
- ICLTensor *_input;
- ICLTensor *_output;
- ICLTensor *_beginData;
- ICLTensor *_endData;
- ICLTensor *_stridesData;
- int32_t _beginMask;
- int32_t _endMask;
- int32_t _shrinkAxisMask;
-};
-}
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
new file mode 100644
index 000000000..6b26a85c8
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLStridedSliceEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLStridedSliceEx class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLStridedSliceExKernel
+ */
+class CLStridedSliceEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and outputs
+ * @param[in] input Tensor input. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData 'begin' vector of strided slice operation
+ * @param[in] endData 'end' vector of strided slice operation
+ * @param[in] stridesData 'strides' vector of strided slice operation
+ * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+ * @param[in] endMask If the ith bit is set, end[i] is ignored
+ * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+ * dimensionality by 1, taking on the value at index begin[i]
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
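A mask-bit sketch (tensor contents and any begin/end semantics beyond the doc above are assumptions):

#include <cstdint>
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h"

void strided_slice_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, output, begin, end, strides;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    for (auto *t : {&begin, &end, &strides}) // one S32 entry per dimension
        t->allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));

    // Bit 1 of beginMask is set, so begin[1] is ignored and dimension 1
    // is sliced from its start; no end or shrink-axis bits are set.
    CLStridedSliceEx slice;
    slice.configure(&input, &output, &begin, &end, &strides,
                    /*beginMask*/ 1 << 1, /*endMask*/ 0, /*shrinkAxisMask*/ 0);
    // allocate, fill begin/end/strides on the host, then slice.run()
}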
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index 06cd1ee9b..5327e016f 100644
--- a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -14,51 +14,79 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
#define __ARM_COMPUTE_CLTOPK_V2_H__
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
class ICLTensor;
-/** Basic function to execute TopK operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLTopKV2Kernel
+/**
+ * @brief Class to execute TopKV2 operation.
*/
class CLTopKV2 : public IFunction
{
public:
- /** Constructor */
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
CLTopKV2();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2(const CLTopKV2 &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
CLTopKV2 &operator=(const CLTopKV2 &) = delete;
- /** Allow instances of this class to be moved */
+
+ /**
+   * @brief Construct a new CLTopKV2 object by using move constructor
+ * @param[in] CLTopKV2 object to move
+ */
CLTopKV2(CLTopKV2 &&) = default;
- /** Allow instances of this class to be moved */
+
+ /**
+ * @brief Assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
CLTopKV2 &operator=(CLTopKV2 &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations
- * is limited to the given array size.
- *
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
* @param[in] input Input image. Data types supported: U8/S16/F32.
* @param[in] k The value of `k`.
* @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
* input type is F32.
- * @param[out] indices indices related to top k values. Data types supported: S32 if input type
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
* is U8/S16, F32 if input type is F32.
+ * @return N/A
*/
void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
int total_bits = 32, int bits = 4);
- // Inherited methods overridden:
+ /**
+ * @brief Run the kernels contained in the function
+   *        Depending on the value of the environment variable "ACL_TOPKV2" it works differently:
+   *        - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+   *          quick sort on GPU is used.
+   *        - If the value of environment variable "ACL_TOPKV2" == "GPU",
+   *          radix sort on GPU is used.
+   *        - For any other value, TopKV2 runs on the CPU
+ * @return N/A
+ */
void run() override;
private:
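A sketch of driving the environment-variable dispatch documented on run() (setenv is POSIX; k and shapes are illustrative):

#include <cstdlib>
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTopKV2.h"

void topkv2_example()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor input, values, indices;
    input.allocator()->init(TensorInfo(TensorShape(100U), 1, DataType::F32));
    // For an F32 input both outputs are F32, per the configure() contract
    values.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

    setenv("ACL_TOPKV2", "GPU", 1); // select the radix-sort GPU path

    CLTopKV2 topk;
    topk.configure(&input, 5, &values, &indices);
    // allocate, fill input, then topk.run()
}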
diff --git a/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
new file mode 100644
index 000000000..fa7408ecd
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to compute a normalization layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEFillBorderKernel
+ * -# @ref NENormalizationLayerExKernel
+ *
+ */
+class NENormalizationLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported:
+ * F16/F32
+ * @param[out] output Destination with the same dimensions, data type and number of channels of
+ * @p input
+ * @param[in] norm_info Normalization layer information like the normalization type,
+ * normalization size and other parameters.
+ */
+ void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NENormalizationLayerEx
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions
+ * [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported:
+ * F16/F32
+ * @param[in] output Destination with the same dimensions, data type and number of channels of
+ * @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization
+ * size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group; /**< Function memory group */
+ NENormalizationLayerExKernel _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle borders */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYEREX_H__ */
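The NEON variant follows the same pattern but needs no OpenCL setup; a sketch (memory manager omitted, shapes illustrative):

#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void ne_normalization_example()
{
    using namespace arm_compute;

    Tensor src, dst;
    const TensorShape shape(14U, 14U, 64U); // [width, height, IFM]
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    NENormalizationLayerEx norm; // nullptr memory manager: plain allocation
    norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5U));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    norm.run();
}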
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index d535c5da4..05ecdeb22 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -38,255 +38,37 @@
using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
- {"absdiff", "absdiff.cl"},
- {"accumulate", "accumulate.cl"},
- {"accumulate_squared", "accumulate.cl"},
- {"accumulate_weighted", "accumulate.cl"},
- {"activation_layer", "activation_layer.cl"},
- {"activation_layer_qa8", "activation_layer_qa8.cl"},
- {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"},
- {"arithmetic_add", "arithmetic_op.cl"},
- {"arithmetic_sub", "arithmetic_op.cl"},
+ // ARMComputeEx kernels
+ {"activation_layer_ex", "activation_layer_ex.cl"},
+ {"arg_op", "arg_operation.cl"},
+ {"arithmetic_sub_ex", "arithmetic_op_ex.cl"},
{"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
- {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"},
- {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"},
- {"bitwise_or", "bitwise_op.cl"},
- {"bitwise_and", "bitwise_op.cl"},
- {"bitwise_xor", "bitwise_op.cl"},
- {"bitwise_not", "bitwise_op.cl"},
+ {"batch_to_space_nd", "batch_to_space_nd.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
{"cast", "cast.cl"},
{"cast_qasymm_in", "cast.cl"},
{"cast_qasymm_out", "cast.cl"},
- {"channel_combine_NV", "channel_combine.cl"},
- {"channel_combine_RGB888", "channel_combine.cl"},
- {"channel_combine_RGBA8888", "channel_combine.cl"},
- {"channel_combine_UYVY422", "channel_combine.cl"},
- {"channel_combine_YUYV422", "channel_combine.cl"},
- {"channel_shuffle_nchw", "channel_shuffle.cl"},
- {"channel_extract_NV12", "channel_extract.cl"},
- {"channel_extract_NV21", "channel_extract.cl"},
- {"channel_extract_RGB888", "channel_extract.cl"},
- {"channel_extract_RGBA8888", "channel_extract.cl"},
- {"channel_extract_UYVY422", "channel_extract.cl"},
- {"channel_extract_YUYV422", "channel_extract.cl"},
- {"combine_gradients_L1", "canny.cl"},
- {"combine_gradients_L2", "canny.cl"},
- {"concatenate_depth", "concatenate.cl"},
- {"concatenate_width", "concatenate.cl"},
- {"convolution_rectangle", "convolution_rectangle.cl"},
- {"col2im", "col2im.cl"},
- {"convert_depth_down", "depth_convert.cl"},
- {"convert_depth_up", "depth_convert.cl"},
- {"convert_fc_weights", "convert_fc_weights.cl"},
- {"convolution3x3_static", "convolution3x3.cl"},
- {"convolution5x5_static", "convolution5x5.cl"},
- {"convolution7x7_static", "convolution7x7.cl"},
- {"convolution9x9_static", "convolution9x9.cl"},
- {"convolution_separable1x5_static", "convolution5x5.cl"},
- {"convolution_separable5x1_static", "convolution5x5.cl"},
- {"convolution_separable1x7_static", "convolution7x7.cl"},
- {"convolution_separable7x1_static", "convolution7x7.cl"},
- {"convolution_separable1x9_static", "convolution9x9.cl"},
- {"convolution_separable9x1_static", "convolution9x9.cl"},
- {"copy_tensor", "copy_tensor.cl"},
- {"copy_plane", "channel_extract.cl"},
- {"copy_planes_3p", "channel_combine.cl"},
- {"copy_to_keypoint", "fast_corners.cl"},
- {"deconvolution_upsample", "deconvolution_layer.cl"},
- {"depthwise_convolution_3x3", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"},
- {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"},
- {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"},
- {"depthwise_im2col", "depthwise_convolution.cl"},
- {"depthwise_vector_to_tensor", "depthwise_convolution.cl"},
- {"depthwise_weights_reshape", "depthwise_convolution.cl"},
- {"dequantization_layer", "dequantization_layer.cl"},
- {"derivative", "derivative.cl"},
- {"dilate", "dilate.cl"},
- {"direct_convolution1x1", "direct_convolution1x1.cl"},
- {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"},
- {"direct_convolution3x3", "direct_convolution3x3.cl"},
- {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"},
- {"direct_convolution5x5", "direct_convolution5x5.cl"},
- {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
- {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
- {"erode", "erode.cl"},
- {"fast_corners", "fast_corners.cl"},
- {"fill_image_borders_constant", "fill_border.cl"},
- {"fill_image_borders_replicate", "fill_border.cl"},
- {"finalize", "optical_flow_pyramid_lk.cl"},
- {"floor_layer", "floor.cl"},
+ {"comparison_op", "comparison_op.cl"},
+ {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
+ {"depth_to_space", "depth_to_space.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"exp_layer", "exp.cl"},
{"gather", "gather.cl"},
{"gather_1d", "gather.cl"},
{"gather_1d_out", "gather.cl"},
- {"gaussian1x5_sub_x", "gaussian_pyramid.cl"},
- {"gaussian5x1_sub_y", "gaussian_pyramid.cl"},
- {"gemm_accumulate_biases", "gemm.cl"},
- {"gemm_interleave4x4", "gemm.cl"},
- {"gemm_ma_f16", "gemm.cl"},
- {"gemm_ma_f32", "gemm.cl"},
- {"gemm_ma_qs8", "gemm.cl"},
- {"gemm_ma_qs16", "gemm.cl"},
- {"gemm_mv", "gemv.cl"},
- {"gemm_mv_quantized", "gemv.cl"},
- {"gemm_mm_interleaved_transposed_f16", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f32", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"},
- {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"},
- {"gemm_mm_floating_point", "gemm.cl"},
- {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"},
- {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"},
- {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"},
- {"gemm_mm_qs8", "gemm.cl"},
- {"gemm_mm_qs16", "gemm.cl"},
- {"gemm_lc_vm_f32", "gemm.cl"},
- {"gemm_transpose1xW", "gemm.cl"},
- {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"},
- {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"},
- {"gemmlowp_mm_bifrost", "gemmlowp.cl"},
- {"gemmlowp_mm_midgard", "gemmlowp.cl"},
- {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"},
- {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"},
- {"gemmlowp_offset_contribution", "gemmlowp.cl"},
- {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"},
- {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"},
- {"harris_score_3x3", "harris_corners.cl"},
- {"harris_score_5x5", "harris_corners.cl"},
- {"harris_score_7x7", "harris_corners.cl"},
- {"hist_border_kernel", "histogram.cl"},
- {"hist_border_kernel_fixed", "histogram.cl"},
- {"hist_local_kernel", "histogram.cl"},
- {"hist_local_kernel_fixed", "histogram.cl"},
- {"hog_block_normalization", "hog.cl"},
- {"hog_detector", "hog.cl"},
- {"hog_orientation_binning", "hog.cl"},
- {"hysteresis", "canny.cl"},
- {"im2col1x1_stridex1_dchw", "im2col.cl"},
- {"im2col3x3_dchw", "im2col.cl"},
- {"im2col5x5_dchw", "im2col.cl"},
- {"im2col11x11_padx0_pady0_dchw", "im2col.cl"},
- {"im2col_generic_dchw", "im2col.cl"},
- {"im2col_generic_padx0_pady0_dchw", "im2col.cl"},
- {"im2col_reduced_dchw", "im2col.cl"},
- {"init_level", "optical_flow_pyramid_lk.cl"},
- {"init_level_max", "optical_flow_pyramid_lk.cl"},
- {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"},
- {"integral_horizontal", "integral_image.cl"},
- {"integral_vertical", "integral_image.cl"},
- {"IYUV_to_NV12_bt709", "color_convert.cl"},
- {"IYUV_to_RGB888_bt709", "color_convert.cl"},
- {"IYUV_to_RGBA8888_bt709", "color_convert.cl"},
- {"IYUV_to_YUV444_bt709", "color_convert.cl"},
- {"l2_normalize", "l2_normalize.cl"},
- {"lktracker_stage0", "optical_flow_pyramid_lk.cl"},
- {"lktracker_stage1", "optical_flow_pyramid_lk.cl"},
- {"magnitude_phase", "magnitude_phase.cl"},
- {"mean_stddev_accumulate", "mean_stddev.cl"},
- {"minmax", "minmaxloc.cl"},
- {"minmax_border", "minmaxloc.cl"},
- {"minmax_layer", "minmax_layer.cl"},
- {"minmaxloc", "minmaxloc.cl"},
- {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"},
- {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"},
- {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"},
- {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"},
- {"non_max_suppression", "nonmax.cl"},
- {"normalization_layer_cross_map", "normalization_layer.cl"},
- {"normalization_layer_in_map", "normalization_layer.cl"},
- {"NV12_to_IYUV_bt709", "color_convert.cl"},
- {"NV12_to_RGB888_bt709", "color_convert.cl"},
- {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
- {"NV12_to_YUV444_bt709", "color_convert.cl"},
- {"NV21_to_IYUV_bt709", "color_convert.cl"},
- {"NV21_to_RGB888_bt709", "color_convert.cl"},
- {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
- {"NV21_to_YUV444_bt709", "color_convert.cl"},
- {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
- {"permute_201", "permute.cl"},
- {"permute_120", "permute.cl"},
- {"permute_3201", "permute.cl"},
- {"pixelwise_mul_float", "pixelwise_mul_float.cl"},
- {"pixelwise_mul_int", "pixelwise_mul_int.cl"},
+ {"hashtable_lookup", "hashtable_lookup.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+ {"pad", "pad.cl"},
+ {"permute_generic", "permute_ex.cl"},
{"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
{"pixelwise_div_float", "pixelwise_div_float.cl"},
{"pixelwise_div_int", "pixelwise_div_int.cl"},
- {"pooling_layer_2", "pooling_layer.cl"},
- {"pooling_layer_3", "pooling_layer.cl"},
- {"pooling_layer_optimized_3", "pooling_layer.cl"},
- {"pooling_layer_7", "pooling_layer.cl"},
- {"pooling_layer_MxN_nchw", "pooling_layer.cl"},
- {"pooling_layer_MxN_nhwc", "pooling_layer.cl"},
- {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
- {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
- {"quantization_layer", "quantization_layer.cl"},
- {"reduce_max", "reduce_max.cl"},
- {"reduction_operation", "reduction_operation.cl"},
- {"reduction_mean", "reduction_mean.cl"},
- {"remap_nearest_neighbour", "remap.cl"},
- {"remap_bilinear", "remap.cl"},
- {"reshape_layer", "reshape_layer.cl"},
- {"reshape_to_columns", "convolution_layer.cl"},
- {"RGB888_to_IYUV_bt709", "color_convert.cl"},
- {"RGB888_to_NV12_bt709", "color_convert.cl"},
- {"RGB888_to_RGBA8888_bt709", "color_convert.cl"},
- {"RGB888_to_YUV444_bt709", "color_convert.cl"},
- {"RGBA8888_to_IYUV_bt709", "color_convert.cl"},
- {"RGBA8888_to_NV12_bt709", "color_convert.cl"},
- {"RGBA8888_to_RGB888_bt709", "color_convert.cl"},
- {"RGBA8888_to_YUV444_bt709", "color_convert.cl"},
- {"roi_pooling_layer", "roi_pooling_layer.cl"},
- {"scale_nearest_neighbour", "scale.cl"},
- {"scale_bilinear", "scale.cl"},
- {"scharr3x3", "scharr_filter.cl"},
- {"sobel3x3", "sobel_filter.cl"},
- {"sobel_separable5x1", "sobel_filter.cl"},
- {"sobel_separable1x5", "sobel_filter.cl"},
- {"sobel_separable7x1", "sobel_filter.cl"},
- {"sobel_separable1x7", "sobel_filter.cl"},
- {"softmax_layer_norm", "softmax_layer.cl"},
- {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"},
- {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"},
- {"strided_slice", "strided_slice.cl"},
- {"suppress_non_maximum", "canny.cl"},
- {"tablelookup_U8", "tablelookup.cl"},
- {"tablelookup_S16", "tablelookup.cl"},
- {"threshold_binary", "threshold.cl"},
- {"threshold_range", "threshold.cl"},
- {"transpose", "transpose.cl"},
- {"UYVY422_to_IYUV_bt709", "color_convert.cl"},
- {"UYVY422_to_NV12_bt709", "color_convert.cl"},
- {"UYVY422_to_RGB888_bt709", "color_convert.cl"},
- {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"},
- {"warp_affine_nearest_neighbour", "warp_affine.cl"},
- {"warp_affine_bilinear", "warp_affine.cl"},
- {"warp_perspective_nearest_neighbour", "warp_perspective.cl"},
- {"warp_perspective_bilinear", "warp_perspective.cl"},
- {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"},
- {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"},
- {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"},
- {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"},
- {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"},
- {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"},
- {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"},
- {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"},
- {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"},
- {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"},
- {"YUYV422_to_IYUV_bt709", "color_convert.cl"},
- {"YUYV422_to_NV12_bt709", "color_convert.cl"},
- {"YUYV422_to_RGB888_bt709", "color_convert.cl"},
- {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"prelu", "prelu.cl"},
+ {"prelu_qasymm8", "prelu_quantized.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+ {"squared_difference", "squared_difference.cl"},
+ {"strided_slice_ex", "strided_slice_ex.cl"},
{"topkv2_init", "topkv2.cl"},
{"topkv2_find_first_negative", "topkv2.cl"},
{"topkv2_reorder_negatives", "topkv2.cl"},
@@ -296,23 +78,62 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"radixsort_pastehistograms", "topkv2_radixsort.cl"},
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"space_to_batch_4d_nchw", "space_to_batch.cl"},
+ {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
+ {"space_to_depth", "space_to_depth.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
+ "activation_layer_ex.cl",
+#include "./cl_kernels/activation_layer_ex.clembed"
+ },
+ {
+ "arg_operation.cl",
+#include "./cl_kernels/arg_operation.clembed"
+ },
+ {
+ "arithmetic_op_ex.cl",
+#include "./cl_kernels/arithmetic_op_ex.clembed"
+ },
+ {
+ "batch_to_space_nd.cl",
+#include "./cl_kernels/batch_to_space_nd.clembed"
+ },
+ {
"cast.cl",
#include "./cl_kernels/cast.clembed"
},
{
- "fixed_point.h",
-#include "./cl_kernels/fixed_point.hembed"
+ "comparison_op.cl",
+#include "./cl_kernels/comparison_op.clembed"
+ },
+ {
+ "comparison_op_quantized.cl",
+#include "./cl_kernels/comparison_op_quantized.clembed"
+ },
+ {
+ "embedding_lookup.cl",
+#include "./cl_kernels/embedding_lookup.clembed"
+ },
+ {
+ "depth_to_space.cl",
+#include "./cl_kernels/depth_to_space.clembed"
+ },
+ {
+ "exp.cl",
+#include "./cl_kernels/exp.clembed"
},
{
"gather.cl",
#include "./cl_kernels/gather.clembed"
},
{
+ "hashtable_lookup.cl",
+#include "./cl_kernels/hashtable_lookup.clembed"
+ },
+ {
"helpers.h",
#include "./cl_kernels/helpers.hembed"
},
@@ -321,6 +142,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/helpers_asymm.hembed"
},
{
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
+ "neg_tensor.cl",
+#include "./cl_kernels/neg_tensor.clembed"
+ },
+ {
+ "pad.cl",
+#include "./cl_kernels/pad.clembed"
+ },
+ {
"pixelwise_div_float.cl",
#include "./cl_kernels/pixelwise_div_float.clembed"
},
@@ -329,16 +162,32 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/pixelwise_div_int.clembed"
},
{
- "reduce_max.cl",
-#include "./cl_kernels/reduce_max.clembed"
+ "prelu.cl",
+#include "./cl_kernels/prelu.clembed"
+ },
+ {
+ "prelu_quantized.cl",
+#include "./cl_kernels/prelu_quantized.clembed"
+ },
+ {
+ "reduce_operation.cl",
+#include "./cl_kernels/reduce_operation.clembed"
+ },
+ {
+ "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
},
{
- "reduction_mean.cl",
-#include "./cl_kernels/reduction_mean.clembed"
+ "space_to_depth.cl",
+#include "./cl_kernels/space_to_depth.clembed"
},
{
- "strided_slice.cl",
-#include "./cl_kernels/strided_slice.clembed"
+ "squared_difference.cl",
+#include "./cl_kernels/squared_difference.clembed"
+ },
+ {
+ "strided_slice_ex.cl",
+#include "./cl_kernels/strided_slice_ex.clembed"
},
{
"topkv2.cl",
@@ -352,6 +201,11 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
"topkv2_quicksort.cl",
#include "./cl_kernels/topkv2_quicksort.clembed"
},
+ {
+ "permute_ex.cl",
+#include "./cl_kernels/permute_ex.clembed"
+ },
+
#endif /* EMBEDDED_KERNELS */
};
@@ -359,7 +213,7 @@ CLKernelLibraryEx::CLKernelLibraryEx()
: _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
- // CLKernelLibrary is built
+ // CLKernelLibraryEx is built
}
CLKernelLibraryEx &CLKernelLibraryEx::get()
@@ -380,7 +234,7 @@ Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
}
std::string concat_str;
- if (fp16_supported(_device))
+ if (fp16_supported())
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
@@ -434,6 +288,13 @@ void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
_built_programs_map.emplace(built_program_name, program);
}
+bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
+
+bool CLKernelLibraryEx::int64_base_atomics_supported() const
+{
+ return device_supports_extension(_device, "cl_khr_int64_base_atomics");
+}
+
const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
{
const auto program_it = _programs_map.find(program_name);
@@ -525,6 +386,7 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con
cl::NDRange CLKernelLibraryEx::default_ndrange() const
{
+ // GPUTarget _target = get_target_from_device(_device);
cl::Device device = cl::Device::getDefault();
GPUTarget _target = get_target_from_device(device);
cl::NDRange default_range;
diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
new file mode 100644
index 000000000..cbda169fb
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/OpenCLEx.h"
+
+#include <dlfcn.h>
+#include <iostream>
+
+namespace arm_compute
+{
+CLSymbolsEx &CLSymbolsEx::get()
+{
+ static CLSymbolsEx symbols;
+ return symbols;
+}
+
+bool CLSymbolsEx::load_default()
+{
+ static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
+
+ if (_loaded.first)
+ {
+ return _loaded.second;
+ }
+
+ // Indicate that default loading has been tried
+ _loaded.first = true;
+
+ for (const auto &lib : libraries)
+ {
+ if (load(lib))
+ {
+ return true;
+ }
+ }
+
+ std::cerr << "Couldn't find any OpenCL library.\n";
+ return false;
+}
+
+bool CLSymbolsEx::load(const std::string &library)
+{
+ void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
+
+ if (handle == nullptr)
+ {
+ std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
+ // Set status of loading to failed
+ _loaded.second = false;
+ return false;
+ }
+
+#define LOAD_FUNCTION_PTR(func_name, handle) \
+ func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
+
+ LOAD_FUNCTION_PTR(clGetEventInfo, handle);
+ LOAD_FUNCTION_PTR(clSetEventCallback, handle);
+
+#undef LOAD_FUNCTION_PTR
+
+ // Don't call dlclose(handle) or all the symbols will be unloaded !
+
+ // Disable default loading and set status to successful
+ _loaded = std::make_pair(true, true);
+
+ return true;
+}
+
+} // namespace arm_compute
+
+cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size,
+ void *param_value, size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbolsEx::get().load_default();
+ auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr;
+ if (func != nullptr)
+ {
+ return func(event, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type,
+ void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status,
+ void *user_data),
+ void *user_data)
+{
+ arm_compute::CLSymbolsEx::get().load_default();
+ auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr;
+ if (func != nullptr)
+ {
+ return func(event, command_exec_callback_type, pfn_ev_notify, user_data);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
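
A short usage sketch of the two lazily-resolved entry points above: the first call routes through CLSymbolsEx::load_default(), so callers need no explicit initialisation. The callback and function names below are illustrative only.

#include <CL/cl.h>
#include <cstdio>

static void CL_CALLBACK on_event_complete(cl_event, cl_int status, void *)
{
    std::printf("event completed with status %d\n", status);
}

void watch_event(cl_event ev)
{
    // Dispatches through the pointer resolved by dlsym() above; returns
    // CL_OUT_OF_RESOURCES when no OpenCL library could be loaded.
    clSetEventCallback(ev, CL_COMPLETE, on_event_complete, nullptr);
}
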
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
new file mode 100644
index 000000000..f54c7bde3
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#define CONST_ONE 1.f
+#define DIV_OP(a, b) ((a) / (b))
+#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
+
+// Inverse Square-root Activation
+inline TYPE rsqrt_op(TYPE x)
+{
+ return RSQRT_OP(x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#if defined(ACT)
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_ex(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation
+ data = ACTIVATION_OP(ACT, data);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+#endif /* defined(ACT) */
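
As a sketch of the compile-time contract documented above, the host might build this kernel for the rsqrt activation as follows. The option spellings come from the kernel's @note lines; the surrounding function is illustrative, not part of this patch.

#include <set>
#include <string>

std::set<std::string> rsqrt_activation_build_opts()
{
    return {
        "-DACT=rsqrt",       // expands to rsqrt_op() via ACTIVATION_OP
        "-DDATA_TYPE=float", // element type of the tensor
        "-DVEC_SIZE=16",     // elements processed per work-item
        // add "-DIN_PLACE" to write results back into the input tensor
    };
}
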
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
new file mode 100644
index 000000000..9a6921d7c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform arg_max/arg_min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The operation type (code) specifying which operation to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Number of elements along the reduction axis
+ */
+
+__kernel void arg_op(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ DATA_TYPE tval = value;
+ int idx = 0;
+ for(int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+ #if OP_CODE == 1 // ArgMax
+ value = max(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+ #elif OP_CODE == 2 //ArgMin
+ value = min(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+ #else
+ return;
+
+ #endif
+
+ if(tval!=value)
+ {
+ idx = indices[axis];
+ tval = value;
+ }
+ }
+
+ *((__global uint *)out.ptr) = idx;
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
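
For clarity, the reduction arg_op performs along a single axis, restated as plain C++ over a contiguous slice (a sketch, not part of the patch): the running extremum is tracked and the index recorded each time it changes.

#include <cstddef>

std::size_t arg_max(const float *slice, std::size_t dim)
{
    float best      = slice[0];
    std::size_t idx = 0;
    for (std::size_t i = 1; i < dim; ++i)
    {
        if (slice[i] > best) // arg_min (OP_CODE == 2) would use '<'
        {
            best = slice[i];
            idx  = i;
        }
    }
    return idx;
}
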
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
new file mode 100644
index 000000000..2ed698951
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** This function subtracts one tensor from another.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_sub_ex(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
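
The SATURATE switch above selects OpenCL's sub_sat() over plain subtraction. A scalar C++ sketch of the difference, assuming U8 operands for illustration:

#include <cstdint>

std::uint8_t sub_wrapping(std::uint8_t a, std::uint8_t b)
{
    return static_cast<std::uint8_t>(a - b); // 3 - 5 wraps to 254
}

std::uint8_t sub_saturating(std::uint8_t a, std::uint8_t b)
{
    return a > b ? static_cast<std::uint8_t>(a - b) : 0; // 3 - 5 clamps to 0
}
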
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 0c0a9ede6..5cd0a4309 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -2,32 +2,20 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers_asymm.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif /* FIXED_POINT_POSITION */
-
#ifdef SATURATE
#define ADD(x, y) add_sat((x), (y))
#define SUB(x, y) sub_sat((x), (y))
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
new file mode 100644
index 000000000..ad6a48a02
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
+/** Perform batch to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Block sizes should be given as preprocessor arguments using -DBLOCK_SIZE0=size and -DBLOCK_SIZE1=size. e.g. -DBLOCK_SIZE0=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nd(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+ in_index[0] = out_index[0]/BLOCK_SIZE1;
+ in_index[1] = out_index[1]/BLOCK_SIZE0;
+ in_index[2] = out_index[2];
+ in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
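
Restating the kernel's source-index arithmetic for one output element in scalar C++ (a sketch; the w/h/c/n fields follow the kernel's out_index ordering and mirror the code as written):

struct Coord { int w, h, c, n; };

Coord batch_to_space_source(Coord out, int block0, int block1, int batch_out)
{
    Coord in;
    in.w = out.w / block1;
    in.h = out.h / block0;
    in.c = out.c;
    in.n = out.n + ((out.h % block0) * block0 + out.w % block1) * batch_out;
    return in;
}
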
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..bea61f53e
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** Returns the truth value of a binary logical op applied to the two input tensors,
+ * where the op can be AND or OR.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The operation type (code) specifying which operation to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(
+ TENSOR3D_DECLARATION(input1),
+ TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ #if OP_CODE == 1 // LOGICAL AND
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+ #elif OP_CODE == 2 // LOGICAL OR
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
index 113804cca..3d4675e5d 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -2,38 +2,34 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
-#ifndef SCALE_IN
-#define SCALE_IN 1.0f
+#ifndef SCALE
+#define SCALE 1.0f
+#endif
+#ifndef OFFSET
+#define OFFSET 0
#endif
-#ifndef OFFSET_IN
-#define OFFSET_IN 0
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
#endif
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
/** Perform a cast operation on an input tensor.
*
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -65,9 +61,9 @@ __kernel void cast(
0, (__global DATA_TYPE_OUT *)output.ptr);
}
-
/** Perform a cast operation on an QASYMM8 input tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and scale of the input should be given as preprocessor arguments using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -96,8 +92,8 @@ __kernel void cast_qasymm_in(
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
@@ -108,7 +104,8 @@ __kernel void cast_qasymm_in(
/** Perform a cast operation on an QASYMM8 output tensor.
- *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and scale of the output should be given as preprocessor arguments using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -137,8 +134,8 @@ __kernel void cast_qasymm_out(
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN);
+ VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
@@ -146,3 +143,4 @@ __kernel void cast_qasymm_out(
VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
0, (__global DATA_TYPE_OUT *)output.ptr);
}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
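
The two QASYMM8 paths above reduce to the usual affine (de)quantization with the OFFSET and SCALE constants; a scalar C++ sketch, with rounding simplified to truncation as CONVERT's default behaves:

#include <cstdint>

// cast_qasymm_in: quantized -> real
float dequantize(std::uint8_t q, int offset, float scale)
{
    return static_cast<float>(static_cast<int>(q) - offset) * scale;
}

// cast_qasymm_out: real -> quantized
std::uint8_t quantize(float x, int offset, float scale)
{
    return static_cast<std::uint8_t>(x / scale + static_cast<float>(offset));
}
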
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
new file mode 100644
index 000000000..765072556
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
+/** Returns the truth value of comparison operators,
+ * such as equal and not_equal.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT,
+ * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The operation type (code) specifying which operation to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void comparison_op(
+ TENSOR3D_DECLARATION(input1),
+ TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ #if OP_CODE == 1 //EQUAL
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr);
+
+ #elif OP_CODE == 2 //NOT_EQUAL
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)
+ (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
new file mode 100644
index 000000000..1eb305f7b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
+/** Returns the truth value of a comparison between two quantized (QASYMM8) tensors.
+ * @attention Offsets and scales of both inputs should be given as preprocessor arguments using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The operation type (code) specifying which operation to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[in] input2_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void comparison_op_qasymm8(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+ #if OP_CODE == 1 // EQUAL QUANTIZED
+ VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+ #elif OP_CODE == 2 // NOT EQUAL QUANTIZED
+ VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
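
Per element, the kernel dequantizes both operands and compares in float; a scalar C++ sketch of the same:

#include <cstdint>

bool qasymm8_equal(std::uint8_t a, std::uint8_t b,
                   int offset1, float scale1, int offset2, float scale2)
{
    const float af = static_cast<float>(static_cast<int>(a) - offset1) * scale1;
    const float bf = static_cast<float>(static_cast<int>(b) - offset2) * scale2;
    return af == bf; // the OP_CODE switch selects == or != at compile time
}
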
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
new file mode 100644
index 000000000..fef2243e7
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//W
+ out_index[1] = get_global_id(1);//H
+ out_index[2] = get_global_id(2) % DEPTH_OUT;//C
+ out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+
+ in_index[0] = out_index[0]/BLOCK_SIZE;
+ in_index[1] = out_index[1]/BLOCK_SIZE;
+ in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
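
The source-index computation in scalar form (a sketch; BLOCK_SIZE and DEPTH_OUT are the kernel's compile-time constants): each output channel pulls from a channel group selected by the pixel's position inside its block.

struct Coord4 { int w, h, c, n; };

Coord4 depth_to_space_source(Coord4 out, int block, int depth_out)
{
    Coord4 in;
    in.w = out.w / block;
    in.h = out.h / block;
    in.c = out.c + ((out.h % block) * block + out.w % block) * depth_out;
    in.n = out.n;
    return in;
}
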
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..348458fe9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention The number of input dimensions is passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ // lookup ids based on the tensor dimensions
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+ :get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+ :get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+ :get_global_id(2)%DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ :get_global_id(2) / DEPTH_OUT;
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+ + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
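
In effect, exactly one coordinate (the outermost of the NUM_DIMS-dimensional input) is replaced by a value read from the lookups vector, while the others pass through unchanged. A plain C++ analogue for a 2-D table (NUM_DIMS == 2), as a sketch:

#include <vector>

// Each output row i is a copy of input row lookups[i].
std::vector<std::vector<float>>
embedding_lookup_2d(const std::vector<std::vector<float>> &table,
                    const std::vector<int> &lookups)
{
    std::vector<std::vector<float>> out;
    out.reserve(lookups.size());
    for (int id : lookups)
        out.push_back(table.at(id)); // out-of-range ids throw here
    return out;
}
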
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
new file mode 100644
index 000000000..69d94f30a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Perform an exponential operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void exp_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
deleted file mode 100644
index 7807533e2..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_FIXED_POINT_H
-#define ARM_COMPUTE_FIXED_POINT_H
-
-#define TYPE_ALIAS(type, alias) \
- typedef type alias; \
- typedef type alias##x##1; \
- typedef type##2 alias##x##2; \
- typedef type##3 alias##x##3; \
- typedef type##4 alias##x##4; \
- typedef type##8 alias##x##8; \
- typedef type##16 alias##x##16;
-
-TYPE_ALIAS(char, qs8)
-TYPE_ALIAS(short, qs16)
-TYPE_ALIAS(int, qs32)
-
-#define qs8_MIN ((char)CHAR_MIN)
-#define qs8_MAX ((char)CHAR_MAX)
-#define qs16_MIN ((short)SHRT_MIN)
-#define qs16_MAX ((short)SHRT_MAX)
-#define qs32_MIN ((int)INT_MIN)
-#define qs32_MAX ((int)INT_MAX)
-
-#define qu8_MIN ((uchar)0)
-#define qu8_MAX ((uchar)UCHAR_MAX)
-#define qu16_MIN ((ushort)0)
-#define qu16_MAX ((ushort)USHRT_MAX)
-#define qu32_MIN ((uint)0)
-#define qu32_MAX ((uint)UINT_MAX)
-
-#define qs8_TYPE char
-#define qs8x1_TYPE char
-#define qs8x2_TYPE char2
-#define qs8x3_TYPE char3
-#define qs8x4_TYPE char4
-#define qs8x8_TYPE char8
-#define qs8x16_TYPE char16
-
-#define qs16_TYPE short
-#define qs16x1_TYPE short
-#define qs16x2_TYPE short2
-#define qs16x3_TYPE short3
-#define qs16x4_TYPE short4
-#define qs16x8_TYPE short8
-#define qs16x16_TYPE short16
-
-#define qs32_TYPE int
-#define qs32x1_TYPE int
-#define qs32x2_TYPE int2
-#define qs32x3_TYPE int3
-#define qs32x4_TYPE int4
-#define qs32x8_TYPE int8
-#define qs32x16_TYPE int16
-
-/* All internal constants are represented in the maximum supported fixed point format (QS16),
- * thus we define an additional shift parameter required to convert the constant
- * from the maximum supported format to the require one.
- */
-#define qs8_SHIFT 8
-#define qs16_SHIFT 0
-
-#undef VEC_DATA_TYPE_STR
-#undef VEC_DATA_TYPE
-#undef CONVERT_STR
-#undef CONVERT
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#define VEC_DATA_TYPE_STR(type, size) type##x##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
-#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
-#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
-#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
-#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-/** Computes saturating absolute value of fixed point vector.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point absolute value.
- */
-#define ABSQ_SAT_IMPL(type) \
- inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); }
-
-ABSQ_SAT_IMPL(qs8x16)
-ABSQ_SAT_IMPL(qs16x8)
-
-#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
-#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
-
-/** Computes max of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point maximum.
- */
-#define MAXQ_IMPL(type) \
- inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); }
-
-MAXQ_IMPL(qs8x1)
-MAXQ_IMPL(qs8x2)
-MAXQ_IMPL(qs8x4)
-MAXQ_IMPL(qs8x8)
-MAXQ_IMPL(qs8x16)
-MAXQ_IMPL(qs16x1)
-MAXQ_IMPL(qs16x2)
-MAXQ_IMPL(qs16x4)
-MAXQ_IMPL(qs16x8)
-MAXQ_IMPL(qs16x16)
-
-#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
-#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated addition of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point addition. The result is saturated in case of overflow
- */
-#define ADDQ_SAT_IMPL(type) \
- inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); }
-
-ADDQ_SAT_IMPL(qs8x1)
-ADDQ_SAT_IMPL(qs8x2)
-ADDQ_SAT_IMPL(qs8x4)
-ADDQ_SAT_IMPL(qs8x8)
-ADDQ_SAT_IMPL(qs8x16)
-ADDQ_SAT_IMPL(qs16x1)
-ADDQ_SAT_IMPL(qs16x2)
-ADDQ_SAT_IMPL(qs16x4)
-ADDQ_SAT_IMPL(qs16x8)
-ADDQ_SAT_IMPL(qs16x16)
-ADDQ_SAT_IMPL(qs32x1)
-ADDQ_SAT_IMPL(qs32x2)
-ADDQ_SAT_IMPL(qs32x4)
-ADDQ_SAT_IMPL(qs32x8)
-ADDQ_SAT_IMPL(qs32x16)
-
-#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
-#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated subtraction of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point subtraction. The result is saturated in case of overflow
- */
-#define SUBQ_SAT_IMPL(type) \
- inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); }
-
-SUBQ_SAT_IMPL(qs8x1)
-SUBQ_SAT_IMPL(qs8x2)
-SUBQ_SAT_IMPL(qs8x4)
-SUBQ_SAT_IMPL(qs8x8)
-SUBQ_SAT_IMPL(qs8x16)
-SUBQ_SAT_IMPL(qs16x1)
-SUBQ_SAT_IMPL(qs16x2)
-SUBQ_SAT_IMPL(qs16x4)
-SUBQ_SAT_IMPL(qs16x8)
-SUBQ_SAT_IMPL(qs16x16)
-
-#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
-#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/* Multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication.
- */
-#define MULQ_IMPL(type, itype) \
- inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
- return CONVERT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_IMPL(qs8x8, qs16x8)
-MULQ_IMPL(qs16x8, qs32x8)
-MULQ_IMPL(qs8x16, qs16x16)
-MULQ_IMPL(qs16x16, qs32x16)
-
-#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
-#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
-
-/* Saturate multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication. The result is saturated in case of overflow
- */
-#define MULQ_SAT_IMPL(type, itype) \
- inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
- return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_SAT_IMPL(qs8x1, qs16x1)
-MULQ_SAT_IMPL(qs8x2, qs16x2)
-MULQ_SAT_IMPL(qs8x3, qs16x3)
-MULQ_SAT_IMPL(qs8x4, qs16x4)
-MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs8x16, qs16x16)
-MULQ_SAT_IMPL(qs16x1, qs32x1)
-MULQ_SAT_IMPL(qs16x2, qs32x2)
-MULQ_SAT_IMPL(qs16x3, qs32x3)
-MULQ_SAT_IMPL(qs16x4, qs32x4)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
-MULQ_SAT_IMPL(qs16x16, qs32x16)
-
-#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \
- mul_sat_##type##x##size((a), (b), (position))
-#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \
- MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate multiply-accumulate
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate. The result is saturated in case of
- * overflow
- */
-#define MLAQ_SAT_IMPL(type, itype) \
- type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
- (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
- }
-
-MLAQ_SAT_IMPL(qs8x8, qs16x8)
-MLAQ_SAT_IMPL(qs8x16, qs16x16)
-MLAQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
- mla_sat_##type##x##size((a), (b), (c), (position))
-#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \
- MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate multiply-accumulate long
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate long. The result is saturated in case
- * of overflow
- */
-#define MLALQ_SAT_IMPL(type, itype) \
- itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \
- (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, res >> (itype)fixed_point_position); \
- }
-
-MLALQ_SAT_IMPL(qs8x8, qs16x8)
-MLALQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \
- mlal_sat_##type##x##size((a), (b), (c), (position))
-#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \
- MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate division of two fixed point vectors
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point division. The result is saturated in case of overflow
- */
-#define DIVQ_SAT_IMPL(stype, type, itype) \
- inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype conv_a = CONVERT((VopA), itype); \
- itype denominator = CONVERT((VopB), itype); \
- itype numerator = conv_a << (itype)(fixed_point_position); \
- itype res = select((itype)(numerator / denominator), \
- select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \
- (itype)(denominator == (itype)0)); \
- return CONVERT_SAT((res), type); \
- }
-
-DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
-DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
-DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
-DIVQ_SAT_IMPL(qs8, qs8, qs16)
-DIVQ_SAT_IMPL(qs16, qs16, qs32)
-
-#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
-#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
-
-#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \
- div_sat_##type##x##size((a), (b), (position))
-#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \
- DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate exponential of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the exponential function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point exponential. The result is saturated in case of overflow
- */
-#define EXPQ_IMPL(stype, type, size) \
- inline type exp_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
- type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
- type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
- type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
- type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
- type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
- type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
- type dec_m = m >> (type)fixed_point_position; \
- type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \
- fixed_point_position); \
- alpha = CONVERT(abs_diff(VopA, alpha), type); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
- return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \
- clz(sum) > dec_m); /* Saturate result if needed */ \
- }
-
-EXPQ_IMPL(qs8, qs8x2, 2)
-EXPQ_IMPL(qs8, qs8x4, 4)
-EXPQ_IMPL(qs8, qs8x8, 8)
-EXPQ_IMPL(qs8, qs8x16, 16)
-EXPQ_IMPL(qs16, qs16x2, 2)
-EXPQ_IMPL(qs16, qs16x4, 4)
-EXPQ_IMPL(qs16, qs16x8, 8)
-EXPQ_IMPL(qs16, qs16x16, 16)
-
-#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
-#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate logarithm of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the logarithm function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point logarithm. The result is saturated in case of overflow
- */
-#define LOGQ_IMPL(stype, type, size) \
- inline type log_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \
- type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
- type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
- type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
- type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
- type inter_a = \
- select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \
- VopA < const_one); \
- type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
- inter_a = inter_a >> shift_val; \
- inter_a = sub_sat(inter_a, const_one); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
- sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
- sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \
- size, fixed_point_position); \
- return select(select(sum, -sum, VopA < const_one), (type)0, \
- VopA < (type)0); /* Saturate result if needed */ \
- }
-
-LOGQ_IMPL(qs8, qs8x16, 16)
-LOGQ_IMPL(qs16, qs16x8, 8)
-LOGQ_IMPL(qs16, qs16x16, 16)
-
-#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
-#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate inverse square root of a fixed point vector
- *
- * @note Implemented approach uses Newton's method to approximate the inverse square root function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point inverse square root. The result is saturated in case of
- * overflow
- */
-#define INVSQRTQ_IMPL(stype, type, size) \
- inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_three = (type)(3 << (fixed_point_position)); \
- type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
- type temp = select((type)(VopA >> shift_value), \
- select((type)stype##_MAX, (type)(VopA << (-shift_value)), \
- (type)(clz(VopA) > (-shift_value))), \
- (type)(shift_value < (type)0)); \
- type x = temp; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
- { \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- x = MUL_SAT_OP_EXPAND( \
- x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \
- fixed_point_position), \
- temp, stype, size, fixed_point_position)), \
- stype, size, fixed_point_position) >> \
- 1; \
- } \
- type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
- return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \
- (type)(clz(x) > shift_value2)), \
- (type)(shift_value < (type)0)); /* Saturate result if needed */ \
- }
-
-INVSQRTQ_IMPL(qs8, qs8x1, 1)
-INVSQRTQ_IMPL(qs16, qs16x1, 1)
-INVSQRTQ_IMPL(qs8, qs8x16, 16)
-INVSQRTQ_IMPL(qs16, qs16x8, 8)
-
-#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
-#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate hyperbolic tangent of a fixed point vector
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of
- * overflow
- */
-#define TANHQ_IMPL(stype, type, size) \
- inline type tanh_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type const_two = (type)(2 << (fixed_point_position)); \
- type exp2x = \
- EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \
- stype, size, fixed_point_position); \
- type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
- }
-
-TANHQ_IMPL(qs8, qs8x16, 16)
-TANHQ_IMPL(qs16, qs16x8, 8)
-
-#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
-#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
-
-#define floatx16 float16
-#define float16_TYPE float16
-
-#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a * (1 << fixed_point_position) + \
- select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
- out_type); \
- }
-
-CONVERTQ_DOWN_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_IMPL(float16, qs16x16)
-
-#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
- { \
- return CONVERT_SAT(a * (1 << fixed_point_position) + \
- select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \
- out_type); \
- }
-
-CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
-
-#define CONVERTQ_UP_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a, out_type) / (1 << fixed_point_position); \
- }
-
-CONVERTQ_UP_IMPL(qs8x16, float16)
-CONVERTQ_UP_IMPL(qs16x16, float16)
-
-#define SQCVT_SAT_IMPL(type) \
- inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
- { \
- return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
- }
-
-SQCVT_SAT_IMPL(qs8)
-SQCVT_SAT_IMPL(qs16)
-
-#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
-#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
-
-#endif // ARM_COMPUTE_FIXED_POINT_H
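
Since fixed_point.h is deleted wholesale, a scalar C model of the rounding multiply it implemented (MULQ_IMPL) may help when auditing the removal; this is a sketch with illustrative names, not code from the library:

#include <stdint.h>

/* Scalar model of MULQ_IMPL for QS8: widen to the intermediate type,
 * multiply, add half an LSB for rounding, then shift back down. */
static int8_t mul_qs8(int8_t a, int8_t b, int fixed_point_position)
{
    int16_t round_val = (int16_t)(1 << (fixed_point_position - 1));
    int16_t res = (int16_t)a * (int16_t)b + round_val;
    return (int8_t)(res >> fixed_point_position);
}

/* Example in Q3.4 (fixed_point_position = 4): 1.5 * 2.25 -> a = 24,
 * b = 36; res = 864 + 8 = 872; 872 >> 4 = 54, i.e. 3.375 in Q3.4. */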
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
index 25e20f5f2..6b767d6c9 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
new file mode 100644
index 000000000..ed7409852
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Performs a hashtable lookup on the input tensor
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions should be passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
+ * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
+ * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
+ */
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
+{
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+ Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+ int lup_id[4] = {0};
+
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT;
+  lup_id[3] = (NUM_DIMS == 4) ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                              : get_global_id(2) / DEPTH_OUT;
+
+ if (lup_id[NUM_DIMS-1] < 0)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
+ return;
+ }
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+ + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
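
As a cross-check of the indexing above, the NUM_DIMS == 2 case reduces to the following plain-C reference (a sketch; names such as `row_len` are illustrative):

#include <string.h>

/* Each output row i is a copy of input row lookups[i]; a negative
 * lookup produces a zero row, mirroring the VSTORE of 0 in the kernel. */
static void hashtable_lookup_ref(const float *input, const int *lookups,
                                 float *output, int n_lookups, int row_len)
{
    for (int i = 0; i < n_lookups; ++i)
    {
        if (lookups[i] < 0)
            memset(&output[i * row_len], 0, (size_t)row_len * sizeof(float));
        else
            memcpy(&output[i * row_len], &input[lookups[i] * row_len],
                   (size_t)row_len * sizeof(float));
    }
}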
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 8143d2398..0e123ae0a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -24,15 +24,23 @@
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-#if defined(cl_arm_printf)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#define EXPAND(x) x
@@ -175,7 +183,7 @@ typedef struct Tensor4D
*
* @return An image object
*/
-Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
uint stride_x, uint step_x)
{
Vector vector = {
@@ -201,7 +209,7 @@ Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
uint stride_x, uint step_x, uint stride_y, uint step_y)
{
Image img = {.ptr = ptr,
@@ -230,7 +238,7 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
* @return A 3D tensor object
*/
-Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes,
uint stride_x, uint step_x, uint stride_y,
uint step_y, uint stride_z, uint step_z)
@@ -261,7 +269,7 @@ Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
*
* @return A 3D tensor object
*/
-Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes, uint stride_x,
uint step_x, uint stride_y, uint step_y, uint stride_z,
uint step_z)
@@ -276,7 +284,7 @@ Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr,
return tensor;
}
-Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
uint offset_first_element_in_bytes, uint stride_x,
uint step_x, uint stride_y, uint step_y, uint stride_z,
uint step_z, uint stride_w, uint step_w, uint mod_size)
@@ -299,7 +307,7 @@ Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr,
* @param[in] vec Pointer to the starting position of the buffer
* @param[in] x Relative X position
*/
-__global inline const uchar *vector_offset(const Vector *vec, int x)
+inline __global const uchar *vector_offset(const Vector *vec, int x)
{
return vec->ptr + x * vec->stride_x;
}
@@ -310,7 +318,7 @@ __global inline const uchar *vector_offset(const Vector *vec, int x)
* @param[in] x Relative X position
* @param[in] y Relative Y position
*/
-__global inline uchar *offset(const Image *img, int x, int y)
+inline __global uchar *offset(const Image *img, int x, int y)
{
return img->ptr + x * img->stride_x + y * img->stride_y;
}
@@ -322,7 +330,7 @@ __global inline uchar *offset(const Image *img, int x, int y)
* @param[in] y Relative Y position
* @param[in] z Relative Z position
*/
-__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
@@ -335,7 +343,7 @@ __global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int
* @param[in] z Relative Z position
* @param[in] w Relative W position
*/
-__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
w * tensor->stride_w;
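
All of these helpers share the same byte-stride arithmetic; in plain C the 3-D case amounts to (illustrative sketch):

#include <stddef.h>
#include <stdint.h>

/* Equivalent of tensor3D_offset: strides are expressed in bytes, so the
 * base pointer is advanced byte-wise and cast to the element type only
 * at the point of use. */
static const uint8_t *offset3d(const uint8_t *base, int x, int y, int z,
                               size_t stride_x, size_t stride_y, size_t stride_z)
{
    return base + (size_t)x * stride_x + (size_t)y * stride_y + (size_t)z * stride_z;
}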
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
new file mode 100644
index 000000000..e3aa463db
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32.
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void neg_tensor(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
new file mode 100644
index 000000000..ecf4696e9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+/** Performs padding of the input tensor with a constant value
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor dimensions should be passed as preprocessor arguments using -DIW (width), -DIH (height), -DID (depth) and -DIB (batch). e.g. -DIW=4
+ * @attention The value used to fill the padded area should be passed using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ *
+ * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up (for
+ * batch), Top (for height), Left (for width) and Front (for depth) are
+ * required. Supported data type: S32
+ */
+
+__kernel void pad(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int4 pad_values)
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+  int index[4] = {0};
+
+  index[0] = get_global_id(0);             // W
+  index[1] = get_global_id(1);             // H
+  index[2] = get_global_id(2) % DEPTH_OUT; // C
+  index[3] = get_global_id(2) / DEPTH_OUT; // N
+
+  if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
+      index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
+      index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
+      index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+  {
+    *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+  }
+  else
+  {
+    *((__global DATA_TYPE *)out.ptr) =
+        *((__global DATA_TYPE *)tensor4D_offset(&in, index[0] - pad_values.x,
+                                                index[1] - pad_values.y,
+                                                index[2] - pad_values.z,
+                                                index[3] - pad_values.w));
+  }
+}
+
+#endif // defined(DATA_TYPE) && defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
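
A scalar reference model of the branch in the pad kernel, one output element at a time (illustrative C; the parameter names are hypothetical):

/* Output coordinates inside the leading pad border, or past the shifted
 * input extent, take zero_value; everything else is read from the input
 * shifted back by the leading pad amounts. W is fastest-moving, then H,
 * then C, then N, matching the kernel's index order. */
static float pad_ref_element(const float *in, int iw, int ih, int id, int ib,
                             int w, int h, int c, int n,
                             int pad_w, int pad_h, int pad_c, int pad_n,
                             float zero_value)
{
    if (w < pad_w || w >= iw + pad_w || h < pad_h || h >= ih + pad_h ||
        c < pad_c || c >= id + pad_c || n < pad_n || n >= ib + pad_n)
        return zero_value;
    return in[(((n - pad_n) * id + (c - pad_c)) * ih + (h - pad_h)) * iw + (w - pad_w)];
}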
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
new file mode 100644
index 000000000..7cc8b0354
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Performs a generic permute operation on an input tensor of shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector should be passed as preprocessor arguments using -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_generic(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4];
+ int in_index[4];
+  in_index[0] = get_global_id(0);            // W
+  in_index[1] = get_global_id(1);            // H
+  in_index[2] = get_global_id(2) % DEPTH_IN; // C
+  in_index[3] = get_global_id(2) / DEPTH_IN; // B
+  out_index[0] = in_index[P1];
+  out_index[1] = in_index[P2];
+  out_index[2] = in_index[P3];
+  out_index[3] = in_index[P4];
+
+  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) =
+      *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
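
The core of permute_generic is an index shuffle; the same mapping with a runtime perm[] standing in for -DP1..-DP4 (illustrative C):

/* out_index[d] takes the input coordinate selected by the permutation
 * vector, exactly as the P1..P4 assignments do in the kernel above. */
static void permute_index(const int in_index[4], const int perm[4], int out_index[4])
{
    for (int d = 0; d < 4; ++d)
        out_index[d] = in_index[perm[d]];
}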
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
index 512c62023..aa05121b1 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
index 82edf3b1d..fdfb78003 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
@@ -2,40 +2,20 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-
-#if defined(SATURATE)
-#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#else // SATURATE
-#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#endif // SATURATE
-
-#else // FIXED_POINT_POSITION
-
#if defined(SATURATE)
#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
#else // SATURATE
@@ -45,17 +25,14 @@
#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
-#endif // FIXED_POINT_POSITION
-
/** Performs a pixelwise division with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
* @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
* e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
@@ -79,7 +56,7 @@
* @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ * @param[in] scale Integer scaling factor. Supported data types: S32
*/
__kernel void pixelwise_div_int(
TENSOR3D_DECLARATION(in1),
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index ddc9d5a27..ab1307e64 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2016, 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers_asymm.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
new file mode 100644
index 000000000..68da2ba32
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+}
+#endif // defined(DATA_TYPE)
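The float kernel above is a straightforward elementwise select; as a point of reference, its computation can be modelled on the host with a few lines of C++ (a minimal scalar sketch with illustrative names, assuming input and alpha hold the same number of elements):

    #include <cstddef>

    // Scalar reference of the prelu kernel:
    // f(x) = x for x >= 0, alpha * x otherwise.
    void prelu_reference(const float *input, const float *alpha, float *output, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            output[i] = (input[i] < 0.f) ? input[i] * alpha[i] : input[i];
        }
    }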
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
new file mode 100644
index 000000000..7e97b7ed6
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) ((x) - (y))
+
+#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Quantization offsets and scales should be passed as preprocessor arguments using -DOFF_IN1, -DOFF_IN2, -DOFF_OUT, -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take uchar data types.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void prelu_qasymm8(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+ const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
+ const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
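The quantized variant dequantizes both operands with their offset/scale pairs, applies the same select in float, then requantizes with round-to-nearest-even and saturation. A scalar C++ model of that pipeline follows (illustrative names; the offset and scale parameters stand in for the kernel's OFF_*/SCALE_* build constants):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of prelu_qasymm8: dequantize, apply PReLU, requantize.
    uint8_t prelu_qasymm8_ref(uint8_t in, uint8_t alpha,
                              int off_in1, float scale_in1,
                              int off_in2, float scale_in2,
                              int off_out, float scale_out)
    {
        const float in_f    = (in - off_in1) * scale_in1;
        const float alpha_f = (alpha - off_in2) * scale_in2;
        const float out_f   = (in_f < 0.f) ? in_f * alpha_f : in_f;
        // Round to nearest even (the kernel uses convert_int..._rte), then saturate to [0, 255].
        const int q = static_cast<int>(std::nearbyint(out_f / scale_out + off_out));
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }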
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
deleted file mode 100644
index dfa3b85f4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(WIDTH)
-/** Perform reduce max
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduce_max(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output))
-{
- Vector input = CONVERT_TO_VECTOR_STRUCT(input);
- Vector output = CONVERT_TO_VECTOR_STRUCT(output);
-
- __global float *input_addr = (__global float *)(input.ptr);
- __global float *output_addr = (__global float *)(output.ptr);
-
- float max_value = *input_addr;
- for(int x = 1; x < WIDTH; x++)
- {
- float value = *(input_addr + x);
- max_value = max(value, max_value);
- }
-
- // Store max
- *output_addr = max_value;
-}
-#endif // defined(WIDTH)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..8bef49363
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The operation code specifying which reduction to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Extent of the input tensor along the axis being reduced.
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ for(int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+ #if OP_CODE == 1 // REDUCE_MAX
+ value = max(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+ #elif OP_CODE == 2 // REDUCE_MIN
+ value = min(value, *((__global DATA_TYPE *)
+ tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+ }
+
+ *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The operation code specifying which reduction to perform should be passed as a preprocessor argument using
+ * -DOP_CODE=number. e.g. -DOP_CODE=3
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] axis Axis along which the reduction is performed
+ * @param[in] dim Extent of the input tensor along the axis being reduced.
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int axis,
+ const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] =
+ {
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE sum_value = (DATA_TYPE)0;
+ for(int i = 0; i < dim; ++i)
+ {
+ indices[axis] = i;
+ sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ }
+
+ #if OP_CODE == 3 // REDUCE_SUM
+ *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+ #elif OP_CODE == 4 // REDUCE_MEAN
+ *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+ #else // OP NOT SUPPORTED
+ return;
+
+ #endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
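Both kernels reduce one axis by walking it once per output element. A host-side C++ sketch of the sum/mean case for a linearly stored 4D tensor (illustrative names; dims[0] is the innermost extent, and the component of idx along the reduced axis is overwritten by the loop):

    #include <cstddef>
    #include <vector>

    // Scalar model of reduce_sum_mean over one axis of a 4D tensor stored
    // linearly with extents dims[0..3].
    float reduce_mean_at(const std::vector<float> &in, const int dims[4], int idx[4], int axis)
    {
        float sum = 0.f;
        for (int i = 0; i < dims[axis]; ++i)
        {
            idx[axis] = i;
            const std::size_t off =
                ((static_cast<std::size_t>(idx[3]) * dims[2] + idx[2]) * dims[1] + idx[1]) * dims[0] + idx[0];
            sum += in[off];
        }
        return sum / dims[axis]; // REDUCE_MEAN; omit the division for REDUCE_SUM
    }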
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
deleted file mode 100644
index 1a96eea61..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-inline DATA_TYPE sum_8(__global const DATA_TYPE *input)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in = vload8(0, input);
- in.s0123 += in.s4567;
- in.s01 += in.s23;
- return ((in.s0 + in.s1));
-}
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] local_sum Local sum of all elements
- * @param[in] height Height of the input image
- * @param[in] divider Divider to calculate mean
- */
-__kernel void reduction_mean(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- __local DATA_TYPE *local_sums,
- int height,
- int divider)
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- float8 tmp_sum = 0;
- // Calculate partial sum
-
- for(int i = 0; i < height; i++)
- {
- local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i));
- }
- ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider;
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..a0fc2d5a9
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space-to-batch on a 4D input tensor in NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The pad value should be given as a preprocessor argument using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the block size tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the padding size tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+ int in_index[4] = {0, };
+ in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[2] = get_global_id(2) % DEPTH_OUT;
+ in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+ if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+ {
+ *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+ }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space-to-batch on a 4D input tensor in NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The pad value should be given as a preprocessor argument using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the block size tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the padding size tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(block_size),
+ IMAGE_DECLARATION(padding_size))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+ int block_size_x = *((__global int *)(block_size_ptr));
+ int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+ int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+ int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+ int in_index[4] = {0, };
+ in_index[0] = get_global_id(0) * VEC_SIZE;
+ in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+ in_index[2] = (get_global_id(2) % HEIGHT_OUT) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+ in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+ if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+ {
+ VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+ }
+ else
+ {
+ VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)out.ptr);
+ }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
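Each of these kernels derives an input coordinate from an output coordinate: the batch index encodes the block shift, and coordinates that land in the padded border produce ZERO_VALUE. A hedged C++ model of that mapping for the NCHW case (names are illustrative; gx/gy/gz mirror get_global_id(0..2) and depth_out/batch_in mirror -DDEPTH_OUT/-DBATCH_IN):

    // Host-side model of the output->input index mapping used by
    // space_to_batch_4d_nchw.
    struct InIndex { int w, h, c, n; bool padded; };

    InIndex space_to_batch_nchw_index(int gx, int gy, int gz,
                                      int depth_out, int batch_in,
                                      int block_x, int block_y,
                                      int pad_left, int pad_top,
                                      int width_in, int height_in)
    {
        const int shift   = gz / depth_out / batch_in; // which block cell this batch copy holds
        const int shift_x = shift % block_x;
        const int shift_y = shift / block_x;

        InIndex idx;
        idx.w = gx * block_x + shift_x - pad_left;
        idx.h = gy * block_y + shift_y - pad_top;
        idx.c = gz % depth_out;
        idx.n = (gz / depth_out) % batch_in;
        idx.padded = (idx.w < 0 || idx.w >= width_in || idx.h < 0 || idx.h >= height_in);
        return idx;
    }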
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 000000000..f6977045a
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+/** Perform space-to-depth rearrangement of a 4D tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ int out_index[4] = {0};
+ int in_index[4] = {0};
+
+ in_index[0] = get_global_id(0); // W
+ in_index[1] = get_global_id(1); // H
+ in_index[2] = get_global_id(2) % DEPTH_IN; // C
+ in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+ out_index[0] = in_index[0] / BLOCK_SIZE;
+ out_index[1] = in_index[1] / BLOCK_SIZE;
+ out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+ out_index[3] = in_index[3];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
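The index arithmetic above maps each input element (w, h, c, b) to its output position; a small C++ sketch of the same mapping (illustrative names; block and depth_in mirror -DBLOCK_SIZE and -DDEPTH_IN):

    // Host-side model of space_to_depth's input->output index mapping.
    struct OutIndex { int w, h, c, b; };

    OutIndex space_to_depth_index(int w, int h, int c, int b, int block, int depth_in)
    {
        OutIndex o;
        o.w = w / block;
        o.h = h / block;
        // Each block cell is folded into the channel dimension.
        o.c = c + ((h % block) * block + w % block) * depth_in;
        o.b = b;
        return o;
    }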
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
new file mode 100644
index 000000000..3e1a5c97f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns the element-wise squared difference, (input1 - input2)^2, of two tensors.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void squared_difference(
+ TENSOR3D_DECLARATION(input1),
+ TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ sq_diff = diff * diff;
+
+ VSTORE(VEC_SIZE)
+ (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
deleted file mode 100644
index c5ff82f9e..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-
-inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w)
-{
- int stride_x = vector->stride_x;
- int stride_y = stride_x * dim_x;
- int stride_z = stride_y * dim_y;
- int stride_w = stride_z * dim_z;
- Tensor4D tensor =
- {
- .ptr = vector->ptr,
- .offset_first_element_in_bytes = vector->offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w,
- };
- return tensor;
-}
-
-/** Extracts a strided slice up to 4-dimensions
- *
- * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
- * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. -DELEMENT_SIZE=2
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32
- * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32
- * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32
- * @param[in] strides The stride of Y dimension of input tensor to be sliced. Supported data types: S32
- */
-__kernel void strided_slice(VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output),
- const int4 dims_in,
- const int4 dims_out,
- const int4 starts,
- const int4 strides)
-{
- // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset
- Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
- Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
- // Implemenation
- // Infer a Tensor4D from output Vector and output's dimensions info
- // Infer a Tensor4D from input Vector and input's dimensions info
- // Infer indices of output as 4D from the offset of output vector
- // Infer indices of input as 4D from indices of output
- // out(offset of output vector) = in(offset of input)
-
- Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w);
- Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w);
-
- // Must be output_step_x == output_stride_x == an element's size
- const int offset_out = get_global_id(0) * output_stride_x;
- int4 indices_out =
- {
- get_global_id(0) % dims_out.x,
- (offset_out / tensor_out.stride_y) % dims_out.y,
- (offset_out / tensor_out.stride_z) % dims_out.z,
- (offset_out / tensor_out.stride_w) % dims_out.w,
- };
-
- int4 indices_in =
- {
- starts.x + (strides.x * indices_out.x),
- starts.y + (strides.y * indices_out.y),
- starts.z + (strides.z * indices_out.z),
- starts.w + (strides.w * indices_out.w),
- };
-
- *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
new file mode 100644
index 000000000..b39c55b96
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
+/** Extracts a strided slice up to 4-dimensions
+ *
+ * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] starts The start indices of each of the 4 dimensions of the input tensor to be sliced. Supported data types: S32
+ * @param[in] strides The strides of each of the 4 dimensions of the input tensor to be sliced. Supported data types: S32
+ */
+__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ const int4 starts,
+ const int4 strides)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int4 indices_in =
+ {
+ starts.x + (strides.x * get_global_id(0)),
+ starts.y + (strides.y * get_global_id(1)),
+ starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)),
+ starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)),
+ };
+ *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
+}
+#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
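Each work item reads the input element at starts + strides * output_coordinate, with the z/w coordinates unpacked from get_global_id(2) via DEPTH_OUT. A C++ sketch of that index computation (illustrative names; gx/gy/gz mirror get_global_id(0..2)):

    // Host-side model of strided_slice_ex's output->input index computation.
    void strided_slice_indices(int gx, int gy, int gz, int depth_out,
                               const int starts[4], const int strides[4], int out[4])
    {
        out[0] = starts[0] + strides[0] * gx;
        out[1] = starts[1] + strides[1] * gy;
        out[2] = starts[2] + strides[2] * (gz % depth_out); // Z unpacked from gid(2)
        out[3] = starts[3] + strides[3] * (gz / depth_out); // W unpacked from gid(2)
    }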
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
index 0b0cf8218..d97f23a47 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
index deadf8412..0292fab04 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#include "helpers.h"
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
index cac0c071e..c2c2d89a4 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -2,25 +2,17 @@
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
- * SPDX-License-Identifier: MIT
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
// reference:
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
new file mode 100644
index 000000000..1fdd2f98f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
+
+ // Checks performed when output is configured
+ if ((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ if (output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input);
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ if (output != nullptr)
+ {
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+ else
+ {
+ window_changed = update_window_and_padding(
+ win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+ }
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLActivationLayerExKernel::CLActivationLayerExKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
+ ActivationLayerInfoEx act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _run_in_place = (output == nullptr) || (output == input);
+
+ if (output != nullptr)
+ {
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const DataType dt = input->info()->data_type();
+ float a_const = act_info.a();
+ float b_const = act_info.b();
+ int a_const_int = 0;
+ int b_const_int = 0;
+
+ // Create quantized version of constants a, b if needed
+ if (is_data_type_quantized(dt))
+ {
+ a_const_int =
+ input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+ b_const_int =
+ input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+ }
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(
+ ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ if (is_data_type_quantized(dt))
+ {
+ build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+ build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+ const int o1 = input->info()->quantization_info().offset;
+ // Quantized value of 0 corresponds to the offset o1
+ build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
+ // Set scale and offset of the input and output if they have different quantization info
+ if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
+ {
+ const float s1 = input->info()->quantization_info().scale;
+ const float s2 = output->info()->quantization_info().scale;
+ const int o2 = output->info()->quantization_info().offset;
+
+ if (o1 != o2 || s1 != s2)
+ {
+ build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+ build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+ build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+ build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+ }
+ }
+ }
+ else
+ {
+ build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+ build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+ }
+
+ build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+
+ // Create kernel
+ std::string kernel_name = std::string("activation_layer_ex");
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "activation_layer_ex_";
+ _config_id += lower_string(string_from_data_type(dt));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info)
+{
+ const bool run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(),
+ (run_in_place) ? nullptr : output->clone().get())
+ .first);
+
+ return Status{};
+}
+
+void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if (!_run_in_place)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
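
Note on the quantized path in the kernel above: A_VAL/B_VAL are produced by standard
asymmetric quantization, q = round(v / scale) + offset, clamped to the 8-bit range. A
minimal CPU sketch of that mapping (`quantize_u8` is an illustrative helper, not an
arm_compute API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // q = round(v / scale) + offset, clamped to [0, 255].
    uint8_t quantize_u8(float value, float scale, int offset)
    {
      const int q = static_cast<int>(std::lround(value / scale)) + offset;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }
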
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
new file mode 100644
index 000000000..c1a2ad0be
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(argminmax_axis, 1);
+
+ return out_shape;
+}
+} // namespace
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t argminmax_axis, ArgOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+ DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's total size must be non-zero");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's size does not match argminmax_axis");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      argminmax_axis >= num_dimensions,
+      "argminmax_axis must be greater than or equal to 0 and less than the input's rank.");
+ return Status{};
+}
+
+} // namespace
+
+CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
+
+void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t argminmax_axis, ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), output->info(), argminmax_axis, op));
+
+ _input = input;
+ _output = output;
+ _argminmax_axis = argminmax_axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
+
+ // Construct kernel name for argmax and argmin based on axis
+ std::string kernel_name = "arg_op";
+ int op_code = 0;
+ if (op == ArgOperation::MAX)
+ {
+ op_code = 1;
+ }
+ else if (op == ArgOperation::MIN)
+ {
+ op_code = 2;
+ }
+ else
+ throw std::runtime_error("Operation not supported, yet");
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t argminmax_axis, ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
+
+ return Status{};
+}
+
+void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _argminmax_axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy the output's shape so it can be restored at the end of this method
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Restore the output tensor's original shape
+ _output->info()->set_tensor_shape(shape_out);
+}
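
Note on the kernel above: inferOutputShape collapses the reduced axis to 1, and for
every output coordinate the kernel writes the index of the min/max element along that
axis. A reference CPU sketch of the per-element reduction (strided access, illustrative
names only):

    #include <cstddef>
    #include <vector>

    // Index of the largest element among data[base + i*stride], i in [0, axis_size).
    size_t argmax_axis(const std::vector<float> &data, size_t base, size_t axis_size,
                       size_t stride)
    {
      size_t best = 0;
      for (size_t i = 1; i < axis_size; ++i)
      {
        if (data[base + i * stride] > data[base + best * stride])
          best = i;
      }
      return best;
    }
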
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
new file mode 100644
index 000000000..1c505b4d5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ output->data_type() == DataType::U8 &&
+ (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+ ITensorInfo *output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output, out_shape);
+
+ if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output, Format::S16);
+ }
+ else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output, Format::F16);
+ }
+ else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output, Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
+
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+ input2->clone().get(),
+ output->clone().get())
+ .first);
+
+ return Status{};
+}
+
+void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLArithmeticSubtractionExKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
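
Note on the broadcast checks above: TensorShape::broadcast_shape follows the usual
NumPy-style rule, which the "total_size() == 0" guard relies on. A standalone sketch of
that rule, assuming dimensions are stored innermost-first as in TensorShape:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Each dimension pair must match or one side must be 1; otherwise the result is
    // the empty shape (total size 0), which validate_arguments rejects.
    std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b)
    {
      const size_t n = std::max(a.size(), b.size());
      a.resize(n, 1);
      b.resize(n, 1);
      std::vector<size_t> out(n);
      for (size_t i = 0; i < n; ++i)
      {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
          return {};
        out[i] = std::max(a[i], b[i]);
      }
      return out;
    }
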
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
new file mode 100644
index 000000000..b0016d23c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t *block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                  "Input Depth should be equal to Output Depth");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
+      "Input batch should be equal to (output batch * block size[0] * block size[1])");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) ||
+                                      (output->dimension(1) % block_size[0]),
+                                  "Output height and width should be divisible by block size[0] "
+                                  "and block_size[1] respectively");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
+                                      (output->dimension(1) != input->dimension(1) * block_size[0]),
+                                  "Output height and width should be equal to "
+                                  "input_height*blocksize[0] and input_width*blocksize[1] "
+                                  "respectively");
+
+ return Status{};
+}
+
+} // namespace
+
+CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t *block_size)
+{
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
+ build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
+ build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
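
Note on the kernel above: the OpenCL source (batch_to_space_nd.cl) is not part of this
diff, but the expected index mapping, assuming the TensorFlow convention in which the
input batch is the output batch tiled by block_size[0]*block_size[1], is sketched below
for NHWC data. `out` must be pre-sized to N*H*block0*W*block1*C.

    #include <vector>

    // in[N*block0*block1][H][W][C] -> out[N][H*block0][W*block1][C], flattened row-major.
    void batch_to_space_ref(const std::vector<float> &in, std::vector<float> &out,
                            int N, int H, int W, int C, int block0, int block1)
    {
      for (int b = 0; b < N; ++b)
        for (int h = 0; h < H * block0; ++h)
          for (int w = 0; w < W * block1; ++w)
            for (int c = 0; c < C; ++c)
            {
              const int in_b = ((h % block0) * block1 + (w % block1)) * N + b;
              const int in_idx = ((in_b * H + h / block0) * W + w / block1) * C + c;
              const int out_idx = ((b * (H * block0) + h) * (W * block1) + w) * C + c;
              out[out_idx] = in[in_idx];
            }
    }
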
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..3d2f2c702
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, BinaryLogicalOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "binary_logical_op";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+ int op_code = 0;
+ switch (op)
+ {
+ case BinaryLogicalOperation::AND:
+ op_code = 1;
+ break;
+ case BinaryLogicalOperation::OR:
+ op_code = 2;
+ break;
+ default:
+ throw std::runtime_error("Operation not supported, yet");
+ }
+
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
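
Note on the run() loops above: this kernel, the subtraction kernel, and the comparison
kernel share one broadcast pattern; dimensions from DimZ upward are only collapsed when
both inputs agree on them, so a broadcast dimension is never merged away. A simplified
scalar sketch of that test:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // True only when every dimension at or above `from` matches between both shapes.
    bool can_collapse_above(const std::vector<size_t> &a, const std::vector<size_t> &b,
                            size_t from)
    {
      if (std::min(a.size(), b.size()) <= from)
        return false; // a broadcast dimension could be crossed; keep the window as-is
      for (size_t d = from; d < std::max(a.size(), b.size()); ++d)
      {
        if ((d < a.size() ? a[d] : 1) != (d < b.size() ? b[d] : 1))
          return false;
      }
      return true;
    }
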
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index b019e8c33..bf7ebae3f 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
using namespace arm_compute;
@@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = input->info()->quantization_info().scale;
const int offset_in = input->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
@@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
{
const float scale_in = output->info()->quantization_info().scale;
const int offset_in = output->info()->quantization_info().offset;
- build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
- build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+ build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+ build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
_kernel = static_cast<cl::Kernel>(
CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
@@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->info()->valid_region());
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
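
Note on the renamed build options above: SCALE/OFFSET feed the two QASYMM8 cast paths,
which reduce to the scalar conversions below (a sketch; the cast_qasymm_in/out kernel
sources are not shown in this diff):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // cast_qasymm_in: quantized -> real.
    float dequantize_qasymm8(uint8_t q, float scale, int offset)
    {
      return (static_cast<int>(q) - offset) * scale;
    }

    // cast_qasymm_out: real -> quantized, clamped to [0, 255].
    uint8_t quantize_qasymm8(float v, float scale, int offset)
    {
      const int q = static_cast<int>(std::lround(v / scale)) + offset;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }
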
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
new file mode 100644
index 000000000..5af5b16ea
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32, DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output, const ComparisonOperation &op)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "comparison_op";
+ int op_code = 0;
+
+ switch (op)
+ {
+ case ComparisonOperation::EQUAL:
+ op_code = 1;
+ break;
+ case ComparisonOperation::NOT_EQUAL:
+ op_code = 2;
+ break;
+ default:
+ throw std::runtime_error(" Operation not supported, yet");
+ }
+
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+ build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
+ build_opts.emplace(
+ ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
+ ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
+ (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
+ {
+ build_opts.emplace("-DOFFSET_IN1=" +
+ support::cpp11::to_string(input1->info()->quantization_info().offset));
+ build_opts.emplace("-DOFFSET_IN2=" +
+ support::cpp11::to_string(input2->info()->quantization_info().offset));
+ build_opts.emplace("-DSCALE_IN1=" +
+ support::cpp11::to_string(input1->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_IN2=" +
+ support::cpp11::to_string(input2->info()->quantization_info().scale));
+ kernel_name += "_qasymm8";
+ }
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ if (input1->info()->data_type() == DataType::S16 ||
+ input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if (input1->info()->data_type() == DataType::F16 &&
+ input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input1->info()->data_type() == DataType::F32 ||
+ input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLComparisonOpKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
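
Note on the _qasymm8 variant above: when the two inputs carry different quantization
parameters, SCALE_IN1/2 and OFFSET_IN1/2 let the kernel compare in the real domain
rather than on raw quantized codes. A scalar sketch of an EQUAL comparison under that
scheme (illustrative only):

    #include <cstdint>

    bool qasymm8_equal(uint8_t q1, float scale1, int offset1,
                       uint8_t q2, float scale2, int offset2)
    {
      const float v1 = (static_cast<int>(q1) - offset1) * scale1;
      const float v2 = (static_cast<int>(q2) - offset2) * scale2;
      return v1 == v2;
    }
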
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..c386e3312
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(2) != input->dimension(2) / (block_size * block_size),
+      "Output depth should be equal to (Input depth / (block size * block size))");
+
+ return Status{};
+}
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
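
Note on the kernel above: the depth_to_space OpenCL source is not in this diff, but the
expected index mapping for NHWC data with a square block size, assuming the TensorFlow
convention, is sketched below. `out` must be pre-sized to N*(H*bs)*(W*bs)*(C/(bs*bs)).

    #include <vector>

    // in[N][H][W][C] -> out[N][H*bs][W*bs][C/(bs*bs)], flattened row-major.
    void depth_to_space_ref(const std::vector<float> &in, std::vector<float> &out,
                            int N, int H, int W, int C, int bs)
    {
      const int C_out = C / (bs * bs);
      for (int n = 0; n < N; ++n)
        for (int h = 0; h < H * bs; ++h)
          for (int w = 0; w < W * bs; ++w)
            for (int c = 0; c < C_out; ++c)
            {
              const int in_c = ((h % bs) * bs + (w % bs)) * C_out + c;
              const int in_idx = ((n * H + h / bs) * W + w / bs) * C + in_c;
              const int out_idx = ((n * (H * bs) + h) * (W * bs) + w) * C_out + c;
              out[out_idx] = in[in_idx];
            }
    }
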
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+ _input = input;
+ _output = output;
+ _lookups = lookups;
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
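
Note on the kernel above: embedding lookup copies whole rows of the input table, one per
index in `lookups`. A reference sketch over a row-major 2-D table (illustrative names;
out-of-range indices are the caller's responsibility here):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void embedding_lookup_ref(const std::vector<float> &table, size_t row_len,
                              const std::vector<int32_t> &lookups, std::vector<float> &out)
    {
      out.resize(lookups.size() * row_len);
      for (size_t i = 0; i < lookups.size(); ++i)
      {
        const float *src = table.data() + static_cast<size_t>(lookups[i]) * row_len;
        std::copy(src, src + row_len, out.begin() + i * row_len);
      }
    }
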
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
new file mode 100644
index 000000000..b1ee21bdc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
index 23efafa6a..ae2801e2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -17,26 +17,14 @@
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 1;
Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output)
@@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
return Status{};
}
@@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n
void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
_input1 = input1;
_input2 = input2;
@@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..cd7b21c6d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits(nullptr)
+{
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's shape was not set");
+
+ // The number of lookups must match both the hits vector and the output's outermost dimension
+ ARM_COMPUTE_RETURN_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+ output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(lookups->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(keys->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(hits->num_dimensions() > 1);
+
+ return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+ _lookups = lookups;
+ _keys = keys;
+ _input = input;
+ _output = output;
+ _hits = hits;
+
+ // Make _lookup_indices tensor
+ _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices->allocator()->init(
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ _lookup_indices->allocator()->allocate();
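+ // This S32 tensor carries the host-computed row indices into the CL kernel;
+ // run() fills it with the key position for each lookup, or -1 on a miss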
+
+ // Set kernel build options
+ std::stringstream kernel_name;
+ std::set<std::string> build_opts;
+ kernel_name << "hashtable_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const_cast<ICLTensor *>(_lookups)->map(queue);
+ const_cast<ICLTensor *>(_keys)->map(queue);
+ _hits->map(queue);
+ _lookup_indices->map(queue);
+
+ // Compute the lookup indices and hit flags on the host
+ const int32_t *lookups_buf =
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+ uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+ int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+ std::map<int32_t, size_t> key_map;
+ const size_t keys_num = _keys->info()->dimension(0);
+ for (size_t key_index = 0; key_index < keys_num; key_index++)
+ {
+ key_map[keys_buf[key_index]] = key_index;
+ }
+
+ const size_t lookups_num = _lookups->info()->dimension(0);
+ for (size_t i = 0; i < lookups_num; ++i)
+ {
+ const auto lookup_value = lookups_buf[i];
+ const auto it = key_map.find(lookup_value);
+ if (it != key_map.end())
+ {
+#if defined(DEBUG)
+ if (it->second >= lookups_num)
+ ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(DEBUG)
+ lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+ hits_buf[i] = static_cast<uint8_t>(1);
+ }
+ else
+ {
+ lookup_indices_buf[i] = -1;
+ hits_buf[i] = static_cast<uint8_t>(0);
+ }
+ }
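+
+ // Illustrative example with hypothetical data: keys = {7, 9, 4} and lookups = {9, 5}
+ // give key_map = {4:2, 7:0, 9:1}, hence lookup_indices = {1, -1} and hits = {1, 0}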
+
+ const_cast<ICLTensor *>(_lookups)->unmap(queue);
+ const_cast<ICLTensor *>(_keys)->unmap(queue);
+ _hits->unmap(queue);
+ _lookup_indices->unmap(queue);
+
+ Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+ Window win_lookup;
+ win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, win);
+ add_4D_tensor_argument(idx, _output, win);
+ add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+ enqueue(queue, *this, win);
+ } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..80d99dd3b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(),
+ output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..12bbe910f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int norm_size = norm_info.norm_size();
+ bool is_in_map = norm_info.is_in_map();
+
+ const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
+ const BorderSize border_size = BorderSize(0, border_width);
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration =
+ is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2))
+ : num_elems_processed_per_iteration;
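+ // e.g. an IN_MAP mode with norm_size = 5 reads 4 + 2 * (5 / 2) = 8 elements per
+ // iteration to cover a radius-2 neighbourhood on each side of the vector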
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
+ // the kernel, avoiding padding
+ AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
+ : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
+{
+}
+
+BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; }
+
+void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
+ NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+
+ _input = input;
+ _output = output;
+ _is_in_map = norm_info.is_in_map();
+
+ // Match the border computed in validate_and_configure_window()
+ const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option(
+ ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+ build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+ build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+ build_opts.add_option(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
+ build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+ build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+
+ // Create kernel
+ std::string kernel_name =
+ _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "normalization_layer_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(
+ static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(norm_info.norm_size());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+
+ return Status{};
+}
+
+void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ } while (window_collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..241f8ae4d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
+
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info()));
+
+ _input = input;
+ _alpha = alpha;
+ _output = output;
+
+ // Create kernel
+ std::string kernel_name = "prelu";
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ build_opts.emplace("-DOFF_IN1=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_IN2=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().offset));
+ build_opts.emplace("-DOFF_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.emplace("-DSCALE_IN1=" +
+ support::cpp11::to_string(input->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_IN2=" +
+ support::cpp11::to_string(alpha->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_OUT=" +
+ support::cpp11::to_string(output->info()->quantization_info().scale));
+ kernel_name += "_qasymm8";
+ }
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input->info()->data_type() == DataType::F32 ||
+ alpha->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+ AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
+ ARM_COMPUTE_UNUSED(window_changed);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input->info()->tensor_shape();
+ const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
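+ // e.g. shapes {8, 4, 2, 3} and {8, 4, 2, 3} stay collapsible, while {8, 4, 2, 3}
+ // and {8, 4, 1, 1} do not, since they diverge from DimZ on and need broadcasting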
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_input1);
+ add_3D_tensor_argument(idx, _alpha, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLPReLUKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
new file mode 100644
index 000000000..99b54c822
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+ const ITensorInfo *pad_size_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() < 1 ||
+ input_info->num_dimensions() > 4,
+ "Pad kernel supports up to 4-D input tensor");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input_info->num_dimensions() != output_info->num_dimensions(),
+ "output tensor should have same number of dimensions as input tensor");
+
+ if (input_info->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+ output_info->quantization_info(),
+ "The input and output quantization info are different!");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+ _input = input;
+ _output = output;
+ _pad_size = pad_size;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
+ if (input->info()->data_type() == DataType::QASYMM8)
+ {
+ build_opts.emplace("-DZERO_VALUE=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
+ else
+ {
+ build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ _pad_size->map(queue);
+
+ // Based on the tensor rank, only the leading padding values (batch-up, height-top,
+ // width-left and depth-front) are required by the kernel
+ int rank = _pad_size->info()->dimension(1);
+
+ auto pad_batch_up =
+ (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
+ auto pad_height_top =
+ (rank >= 2)
+ ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
+ : 0;
+ auto pad_width_left = (rank >= 1)
+ ? *reinterpret_cast<const int32_t *>(
+ _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
+ : 0;
+ auto pad_depth_front =
+ (rank >= 3)
+ ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
+ : 0;
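+
+ // Only column 0 (the "before" padding) of each pad_size row is read; for a rank-4
+ // input the rows are ordered {batch, height, width, depth}, i.e. rows 0-3 above,
+ // while lower ranks use the rank-specific row indices computed in the ternaries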
+
+ _pad_size->unmap(queue);
+
+ // Padding values packed into a cl_int4 to pass to the kernel
+ const cl_int4 paddingValues = {
+ {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+ static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ _kernel.setArg<cl_int4>(idx++, paddingValues);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
new file mode 100644
index 000000000..aa094761c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
+{
+ TensorShape output_shape = input->tensor_shape();
+ permute(output_shape, perm);
+ return output_shape;
+}
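+
+// e.g. assuming permute() moves input dimension perm[i] to output dimension i,
+// an input of shape {2, 3, 4} with perm = [2, 0, 1] yields {4, 2, 3}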
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+
+ // Validate configured output
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ return Status{};
+}
+} // namespace
+
+CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {}
+
+void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+ _input = input;
+ _output = output;
+ _perm = perm;
+
+ const TensorShape output_shape = get_output_shape(input->info(), perm);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ // New positions of width(W), height(H), channel(C) and batch(N) based on the permutation vector
+ build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0]));
+ build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1]));
+ build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2]));
+ build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3]));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The CLPermute doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
+
+ return Status{};
+}
+
+void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_out(slice_in);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
index a3e0163de..b985aa737 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -17,20 +17,8 @@
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
using namespace arm_compute;
@@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
const TensorShape &out_shape =
@@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
"Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
-
- if (is_data_type_fixed_point(input1->data_type()))
- {
- // All data types must be all QS8 or all QS16
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
- "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
- }
// Validate in case of configured output
if (output->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
- DataType::QS16, DataType::S16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
output->data_type() == DataType::U8 &&
@@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
"Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
- if (is_data_type_fixed_point(input1->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- }
}
return Status{};
@@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
{
compute_type = "int";
}
- else if (input1->info()->data_type() == DataType::QS8)
- {
- compute_type = "qs8";
- }
- else if (input1->info()->data_type() == DataType::QS16)
- {
- compute_type = "qs16";
- }
else
{
compute_type = "ushort";
@@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
: "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
: "-DROUND=_rte");
- if (is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
_kernel.setArg(idx++, scale);
}
- ICLKernel::configure(win_config.second);
+ ICLKernel::configure_internal(win_config.second);
}
Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
deleted file mode 100644
index 168b246bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- // We can handle for simple case only
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
- "Output same type allowed for input and output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
- "Only support for output dimension 1");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
- "Only support for input dimension 2");
- }
-
- return Status{};
-}
-
-} // namespace
-
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
-
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- // Configure kernel window
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- Window win;
- win.set(0, Window::Dimension(0, cols, 1));
- win.set(1, Window::Dimension(0, rows, 1));
-
- // Construct kernel name
- std::string kernel_name = "reduce_max";
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- ICLKernel::configure(win);
-}
-
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
-
- return Status{};
-}
-
-void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window window_input = window;
- Window slice_input = window_input.first_slice_window_1D();
-
- do
- {
- Window slice_output = slice_input.shift_dimensions(1);
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, slice_input);
- add_1D_tensor_argument(idx, _output, slice_output);
- enqueue(queue, *this, slice_input);
-
- } while (window_input.slide_window_slice_1D(slice_input));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..f581780e1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+ TensorShape out_shape{input_shape};
+
+ out_shape.set(axis, 1);
+
+ return out_shape;
+}
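+
+// e.g. inferOutputShape({3, 5, 7}, /*axis=*/1) returns {3, 1, 7}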
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32, DataType::S32);
+ if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+ "QASYMM8 is not supported yet");
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+ "Output's shape was not set");
+
+ const auto num_dimensions = input->tensor_shape().num_dimensions();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions,
+ "axis must be less than the input's rank");
+
+ const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+ "output shape's size does not match axis");
+
+ return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+ output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+ // Construct kernel name
+ std::string kernel_name;
+ int op_code = 0;
+ if (op == ReduceOperation::MAX)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 1;
+ }
+ else if (op == ReduceOperation::MIN)
+ {
+ kernel_name = "reduce_min_max";
+ op_code = 2;
+ }
+ else if (op == ReduceOperation::SUM)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 3;
+ }
+ else if (op == ReduceOperation::MEAN)
+ {
+ kernel_name = "reduce_sum_mean";
+ op_code = 4;
+ }
+ else
+ {
+ throw std::runtime_error("Operation not supported yet");
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+ build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output_info, Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output_info->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const uint32_t axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+ return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &shape_in = _input->info()->tensor_shape();
+
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+ _kernel.setArg<cl_int>(idx++, _axis);
+ _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+ // Support dimensions up to 4
+ Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Save the output's shape so that it can be restored at the end of this method
+ // TODO Remove changing and restoring the output's shape once it is guaranteed that the axis
+ // positions of input and output are the same
+ const TensorShape shape_out = _output->info()->tensor_shape();
+ _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+ idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+
+ // Restore the output tensor's original shape
+ _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
deleted file mode 100644
index 84a77122d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
- "Reduction axis greater than max number of dimensions");
-
- std::vector<uint32_t>::const_iterator it;
- bool axis_w = false;
- bool axis_h = false;
- for (it = axis.begin(); it != axis.end(); ++it)
- {
- if ((*it) == 0)
- {
- axis_w = true;
- }
- else if ((*it) == 1)
- {
- axis_h = true;
- }
- else
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
- }
- // TODO Other axises (currently, only axises for both width and height are supported.)
- if (!axis_w || !axis_h)
- {
- ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
- }
-
- if (output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- // Output tensor auto initialization if not yet initialized
- TensorShape output_shape{input->tensor_shape()};
- output_shape.set(0, 1);
- output_shape.set(1, 1);
- auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
- input->fixed_point_position());
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-
- Window win = calculate_max_window(
- *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
- num_elems_processed_per_iteration_y);
- AccessWindowHorizontal output_access(output, 0, 1);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->valid_region());
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
-
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLReductionMeanKernel::CLReductionMeanKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
-{
-}
-
-BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
-
-void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
-
- _input = input;
- _output = output;
- _reduction_axis = axis;
-
- constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
-
- // Set border size
- _border_size = BorderSize(
- ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
- input->info()->dimension(0));
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- // build_opts.emplace(("-DVEC_SIZE=" +
- // support::cpp11::to_string(num_elems_processed_per_iteration)));
- if (is_data_type_fixed_point(input->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" +
- support::cpp11::to_string(input->info()->fixed_point_position()));
- }
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure(std::get<1>(win_config));
-}
-
-Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
- validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
-
- return Status{};
-}
-
-void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Set local sums buffer
- // TODO work_group
- unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-
- unsigned int idx = 2 * num_arguments_per_2D_tensor();
- _kernel.setArg(idx++, local_sum_size, nullptr);
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
- _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
- _input->info()->dimension(1))); // divider
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..6b0697e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+ const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::F16, DataType::S32,
+ DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "Input and output should have the same number of dimensions");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+ "The input and output layouts are different!");
+
+ // TODO Support other cases
+ if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+ "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+ }
+ else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+ "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+ }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+ if (input->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+ "The input and output quantization info are different!");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel()
+    : _input(nullptr), _block_size(nullptr), _padding_size(nullptr), _output(nullptr)
+{
+}
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_size, padding_size, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+ _input = input;
+ _block_size = block_size;
+ _padding_size = padding_size;
+ _output = output;
+
+ // Set kernel build options
+ // TODO Support other cases
+ std::string kernel_name = "space_to_batch_4d";
+ std::set<std::string> build_opts;
+ Window win;
+
+ if (input->info()->data_layout() == DataLayout::NCHW)
+ {
+ kernel_name += "_nchw";
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ }
+ else if (input->info()->data_layout() == DataLayout::NHWC)
+ {
+ kernel_name += "_nhwc";
+ build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->info()->valid_region());
+
+ if (window_changed)
+ {
+      ARM_COMPUTE_ERROR("Insufficient Padding!");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported layout");
+ }
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+ if (input->info()->data_type() == DataType::QASYMM8)
+ {
+ build_opts.emplace("-DZERO_VALUE=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
+ else
+ {
+ build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(DEBUG)
+ const_cast<ICLTensor *>(_block_size)->map(queue);
+ const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+ const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spatial_dimensions = _block_size->info()->dimension(0);
+  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spatial_dimensions; ++i)
+ {
+ const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+ const int32_t padding_size_pre =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+ const int32_t padding_size_post =
+ *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+ ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0,
+                             "Padding size should be greater than or equal to 0");
+
+ if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _output->info()->dimension(i) !=
+ (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(num_dimensions - num_spatial_dimensions - 1 + i) !=
+              (_input->info()->dimension(num_dimensions - num_spatial_dimensions - 1 + i) +
+ padding_size_pre + padding_size_post) /
+ block_size,
+ "Dimension value of spatial block does not match output's dimension value");
+ }
+
+ batch_size *= block_size;
+ }
+  ARM_COMPUTE_ERROR_ON_MSG(
+      _output->info()->dimension(num_dimensions - 1) != batch_size,
+      "Output batch size should be equal to input batch size * (product of all block sizes)");
+
+ const_cast<ICLTensor *>(_block_size)->unmap(queue);
+ const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(DEBUG)
+
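+  // The kernel walks 4D slices of the output window. The input is bound with
+  // a fully collapsed window so only its base address and strides are passed;
+  // each work-item derives its input coordinates from the block size and
+  // padding tensors.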
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ // Set block size window
+ Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+ // Set padding size window
+ Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ add_1D_tensor_argument(idx, _block_size, win_block);
+ add_2D_tensor_argument(idx, _padding_size, win_padding);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..5d6329edc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const int32_t block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(2) * block_size * block_size != output->dimension(2),
+      "Output depth should be equal to (input depth * block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) != 0 ||
+                                      (input->dimension(1) % block_size) != 0,
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
+                                      (output->dimension(1) != (input->dimension(1) / block_size)),
+                                  "Output height and width should be equal to "
+                                  "input_height/blocksize and input_width/blocksize respectively");
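+
+  // Example with block_size = 2: a WxHxCxN = 4x6x3x1 input maps to a
+  // 2x3x12x1 output: width and height shrink by the block size while depth
+  // grows by block_size * block_size.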
+
+ return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+ _input = input;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ // Create kernel
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup output slice
+ Window slice_out(slice_in);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
new file mode 100644
index 000000000..260bc39f1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ const TensorShape &out_shape =
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+ "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+ return Status{};
+}
+} // namespace
+
+CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+ ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+ build_opts.emplace(
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
+
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), out_shape);
+
+ if (input1->info()->data_type() == DataType::F16 &&
+ input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if (input1->info()->data_type() == DataType::F32 ||
+ input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  if (window_changed)
+  {
+    ARM_COMPUTE_ERROR("Insufficient Padding!");
+  }
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
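+  // Collapse dimension Z and above into a single dimension when both input
+  // shapes agree there; any remaining broadcast is handled per slice by the
+  // broadcast_if_dimension_le_one() windows below.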
+ bool can_collapse = true;
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse =
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
+
+ const TensorShape &in_shape1_collapsed =
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed =
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+
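+// The right border accounts for x-axis broadcasting: when one input is
+// narrower than the output, replicating its edge keeps the fixed-width
+// vector loads of the final iteration inside the padded buffer.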
+BorderSize CLSquaredDifferenceKernel::border_size() const
+{
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border =
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
index 80ffd423a..48146a43a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
@@ -14,43 +14,30 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include <string>
-
-using namespace std;
using namespace arm_compute;
-static const int32_t maxDim = 4;
-
-CLStridedSliceKernel::CLStridedSliceKernel()
+CLStridedSliceExKernel::CLStridedSliceExKernel()
: _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
_stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
{
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *begin, const ITensorInfo *end,
- const ITensorInfo *strides, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
+Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *begin, const ITensorInfo *end,
+ const ITensorInfo *strides, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
- DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
@@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
return stop;
}
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
{
int32_t ret = 0;
@@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
return ret;
}
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
- ICLTensor *beginData, ICLTensor *endData,
- ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask)
+void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
+ ICLTensor *beginData, ICLTensor *endData,
+ ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+ int32_t shrinkAxisMask)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
endData->info(), stridesData->info(), beginMask, endMask,
@@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
_endMask = endMask;
_shrinkAxisMask = shrinkAxisMask;
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DELEMENT_DATA_TYPE=" +
get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
// Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
- // Create output's window without padding
- TensorShape collapsed = output->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *output->info();
- info.set_tensor_shape(collapsed);
- Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
-
- ICLKernel::configure(win);
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
}
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Create input window
- TensorShape collapsed = _input->info()->tensor_shape();
- collapsed.collapse(4);
- TensorInfo info = *_input->info();
- info.set_tensor_shape(collapsed);
- Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
-
_beginData->map(queue);
_endData->map(queue);
_stridesData->map(queue);
- std::vector<int32_t> dimsIn;
- std::vector<int32_t> dimsOut;
std::vector<int32_t> starts;
- std::vector<int32_t> stops;
std::vector<int32_t> strides;
for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
@@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
- stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
- reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
- n));
-
strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
- dimsIn.emplace_back(shape[n]);
- dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
}
for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
{
starts.emplace_back(0);
- stops.emplace_back(1);
strides.emplace_back(1);
- dimsIn.emplace_back(1);
- dimsOut.emplace_back(1);
}
// TODO: Apply shrinkAxisMask
@@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
_stridesData->unmap(queue);
_endData->unmap(queue);
- // Set parameters
- unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- const cl_int4 dimsInArg = {{
- static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
- static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsInArg);
-
- const cl_int4 dimsOutArg = {{
- static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
- static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
- }};
- _kernel.setArg<cl_int4>(idx++, dimsOutArg);
-
+ unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
const cl_int4 startsArg = {{
static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
@@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
}};
_kernel.setArg<cl_int4>(idx++, stridesArg);
- // TODO: Apply slicing output's window
- idx = 0;
- add_1D_tensor_argument(idx, _input, win_in);
- add_1D_tensor_argument(idx, _output, window);
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
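+
+  // The input is bound with a fully collapsed window; the start and stride
+  // vectors passed as kernel arguments select which elements each output
+  // point reads.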
- enqueue(queue, *this, window);
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
index d95b485b7..073c2f7bb 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -17,15 +17,8 @@
#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <climits>
-#include <cassert>
namespace arm_compute
{
@@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, 1, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
@@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
@@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
@@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
@@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, n, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
@@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int
// Configure kernel window
Window win;
win.set(0, Window::Dimension(0, k, 1));
- ICLKernel::configure(win);
+ ICLKernel::configure_internal(win);
}
void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..3b5782c25
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEMath.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared,
+ const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+
+ // Checks performed when output is configured
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *input_squared,
+ ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+ const unsigned int num_elems_read_per_iteration =
+ num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows =
+ (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+ const unsigned int border_width =
+ (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+ BorderSize border_size = BorderSize(0, border_width);
+ bool window_changed = false;
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration,
+ num_rows);
+ AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0,
+ num_elems_read_per_iteration, num_rows);
+
+ if (output->total_size() != 0)
+ {
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed =
+ update_window_and_padding(win, input_access, input_squared_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, input_squared_access);
+ }
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NENormalizationLayerExKernel::NENormalizationLayerExKernel()
+ : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr),
+ _norm_info(NormType::IN_MAP_1D), _border_size()
+{
+}
+
+BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; }
+
+void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared,
+ ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
+
+ const unsigned int border_width =
+ (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+
+ _input = input;
+ _input_squared = input_squared;
+ _output = output;
+ _norm_info = norm_info;
+ _border_size = BorderSize(0, border_width);
+
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ switch (norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case DataType::F16:
+ {
+ switch (norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), input_squared->info(),
+ output->info(), norm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerExKernel::normalize_float(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size();
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+ // We account padding across X only and we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
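+  // Each element is normalized as
+  //   out = in / (kappa + scale_coeff * sum(in_squared over the window))^beta
+  // with the window spanning `radius` elements on each side along the
+  // in-map or cross-map axis, clamped to the tensor borders.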
+ if (dt == DataType::F32)
+ {
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for (int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr =
+ input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(
+ input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if (dt == DataType::F16)
+ {
+ const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
+ const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+ const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float16x8_t accu = vdupq_n_f16(0.f);
+ for (int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr =
+ input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(
+ input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ const float16x8_t norm_f16 =
+ vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+ const float16x8_t normalized_pixel = vmulq_f16(
+ vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+Status NENormalizationLayerExKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ input_squared->clone().get(),
+ output->clone().get(), norm_info)
+ .first);
+
+ return Status{};
+}
+
+void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ // Run function
+ (this->*_func)(window);
+}
diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp
new file mode 100644
index 000000000..b63093bbb
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/UtilsEx.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/UtilsEx.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &
+arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act)
+{
+ static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = {
+ {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"},
+ };
+
+ return act_map[act];
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
new file mode 100644
index 000000000..1e52fc429
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+using namespace arm_compute;
+
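+// One-kernel function: configure() creates and owns the kernel, the base
+// class run() enqueues it, and validate() simply forwards to the kernel.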
+void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ActivationLayerInfoEx act_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>();
+ k->configure(input, output, act_info);
+ _kernel = std::move(k);
+}
+
+Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info)
+{
+ return CLActivationLayerExKernel::validate(input, output, act_info);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
new file mode 100644
index 000000000..dff743e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgMinMax.h"
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgMinMax::CLArgMinMax()
+    : _input(nullptr), _output(nullptr), _argminmax_axis(), _arg_op(), _interm_tensors(),
+      _argminmax_kernels(), _num_of_kernels()
+{
+}
+
+void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+ ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+ _input = input;
+ _output = output;
+ _argminmax_axis = axis;
+ _arg_op = op;
+  // NOTE The axis list must not contain duplicates.
+  _num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
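+  // e.g. reducing over axes {1, 2} of a 4D tensor chains two kernels through
+  // one intermediate tensor holding the result of the first reduction.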
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _argminmax_kernels =
+ arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(_argminmax_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+  // Build the ordered tensor list: input, intermediates, output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+  // Configure one kernel per axis along the chain
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op);
+ }
+}
+
+Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op)
+{
+ const size_t num_of_kernels = argminmax_axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(argminmax_axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+  // Build the ordered tensor-info list: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+  // Validate one kernel per axis along the chain
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op));
+ }
+
+ return Status{};
+}
+
+void CLArgMinMax::run()
+{
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_argminmax_kernels[i]);
+ }
+}
+
+} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
new file mode 100644
index 000000000..3f403c80a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+using namespace arm_compute;
+
+void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+
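+  // When one input is broadcast along x (its dimension(0) is 1), replicate
+  // its border so the vectorized kernel can read a full vector at the edge.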
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
new file mode 100644
index 000000000..26e3798cc
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+using namespace arm_compute;
+
+void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
index e1059ab53..8e106737c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -17,7 +17,6 @@
#include "arm_compute/runtime/CL/functions/CLCast.h"
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-#include "support/ToolchainSupport.h"
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
new file mode 100644
index 000000000..f6a745a25
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparisonOp.h"
+
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
new file mode 100644
index 000000000..411fa8700
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLExp.h"
+
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+using namespace arm_compute;
+
+void CLExp::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
index 5552cbc6f..fb056fe45 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
@@ -16,11 +16,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLGather.h"
-#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
new file mode 100644
index 000000000..276c4557a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {}
+
+void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
+
+ // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ return CLNormalizationLayerExKernel::validate(input, output, norm_info);
+}
+
+void CLNormalizationLayerEx::run()
+{
+ // Run border handler
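+ // (enqueued with flush = false so the command queue is flushed only once, by the final enqueue)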
+ CLScheduler::get().enqueue(_border_handler, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..5265b6c34
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+using namespace arm_compute;
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>();
+ k->configure(input, output, pad_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
new file mode 100644
index 000000000..fb363270d
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPermuteEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+using namespace arm_compute;
+
+void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PermutationVector &perm)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>();
+ k->configure(input, output, perm);
+ _kernel = std::move(k);
+}
+
+Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm));
+ return Status{};
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
index e1add5e90..dc0baa8dd 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
@@ -18,9 +18,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
deleted file mode 100644
index 3382058db..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/ToolchainSupport.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include <vector>
-#include <algorithm>
-
-#include <utility>
-
-#define REDUCE_MAX_RUN_ON_CPU 1
-
-namespace arm_compute
-{
-
-CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
-
-void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
-{
- _axis = axis;
-
- _input = input;
- _output = output;
-
- auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
- k->configure(input, axis, output);
- _kernel = std::move(k);
-
- // We can handle for simple case only
- // Output rank: 1
- // Axis: one axis value, restrict to 1
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
- ARM_COMPUTE_ERROR_ON(axis != 1);
-}
-
-Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- return CLReduceMaxKernel::validate(input, axis, output);
-}
-
-void CLReduceMax::run()
-{
-#if REDUCE_MAX_RUN_ON_CPU
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-#else
- arm_compute::CLScheduler::get().enqueue(*_kernel);
-#endif
-}
-
-void CLReduceMax::run_on_cpu()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
-
- // Compute by CPU for simple case
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- float *input_data = (float *)_input->buffer();
- float *output_data = (float *)_output->buffer();
-
- std::vector<float> container_max;
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- container_max.resize(rows);
-
- // Initialize as 1st element in row
- float *input_pointer = input_data;
- for (int i = 0; i < rows; i++)
- {
- container_max[i] = *input_pointer;
- input_pointer += cols;
- }
-
- // Update max value in row
- for (int i = 0; i < rows; i++)
- {
- float max_in_row = container_max[i];
- for (int j = 1; j < cols; j++)
- {
- if (max_in_row < input_data[i * cols + j])
- {
- max_in_row = input_data[i * cols + j];
- }
- }
- container_max[i] = max_in_row;
- }
-
- for (int i = 0; i < rows; i++)
- {
- output_data[i] = container_max[i];
- }
-
- _input->unmap(q);
- _output->unmap(q);
-}
-} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..2b8d82706
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation()
+ : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Build a vector of ITensorInfo pointers ordered from input through the intermediates to output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation for each kernel in the chain
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+
+ // NOTE Duplicate axis values are impossible here since axis is a std::set.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Build a vector of ICLTensor pointers ordered from input through the intermediates to output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Configure each ReduceOperation kernel in the chain
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ }
+}
+
+void CLReduceOperation::run()
+{
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+}
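CLReduceOperation follows the same chained design: configure() eagerly allocates one intermediate CLTensor per axis beyond the first, and run() enqueues the kernels in order. A short usage sketch; the tensor shapes are illustrative only, and ReduceOperation::MEAN is assumed to be one of the enumerators declared in TypesEx.h:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

    #include <cstdint>
    #include <set>

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init();

      CLTensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(16U, 8U, 4U), 1, DataType::F32));
      // Reducing over axes {1, 2} collapses those dimensions to 1.
      output.allocator()->init(TensorInfo(TensorShape(16U, 1U, 1U), 1, DataType::F32));

      CLReduceOperation reduce;
      reduce.configure(&input, &output, std::set<uint32_t>{1, 2}, ReduceOperation::MEAN);

      input.allocator()->allocate();
      output.allocator()->allocate();

      reduce.run(); // enqueues one CLReduceOperationKernel per axis
      return 0;
    }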
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
deleted file mode 100644
index ab724e752..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {}
-
-Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
- return Status{};
-}
-
-void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
-{
- _reduction_mean_kernel.configure(input, output, axis);
- _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT,
- PixelValue(0));
-}
-
-void CLReductionMean::run()
-{
- CLScheduler::get().enqueue(_fill_border_kernel);
- CLScheduler::get().enqueue(_reduction_mean_kernel);
-}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+ k->configure(input, block_size, padding_size, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
new file mode 100644
index 000000000..dc6e4af44
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h"
+
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
deleted file mode 100644
index cd576cec1..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-#include <vector>
-
-using namespace arm_compute;
-
-static const int32_t maxDims = 4;
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t start = startIndices[axis];
-
- // beginMask override
- if (beginMask & 1 << axis)
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the first element. These values will get
- // clamped below (Note: We could have set them to 0 and axisSize-1, but
- // use lowest() and max() to maintain symmetry with StopForAxis())
- start = std::numeric_limits<int32_t>::lowest();
- }
- else
- {
- // Backward iteration - use the last element.
- start = std::numeric_limits<int32_t>::max();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (start < 0)
- {
- start += axisSize;
- }
-
- // Clamping
- start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
- return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t stop = stopIndices[axis];
-
- // endMask override
- if (endMask & (1 << axis))
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the last element. These values will get
- // clamped below
- stop = std::numeric_limits<int32_t>::max();
- }
- else
- {
- // Backward iteration - use the first element.
- stop = std::numeric_limits<int32_t>::lowest();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (stop < 0)
- {
- stop += axisSize;
- }
-
- // Clamping
- // Because the end index points one past the last element, we need slightly
- // different clamping ranges depending on the direction.
- if (strides[axis] > 0)
- {
- // Forward iteration
- stop = arm_compute::utility::clamp(stop, 0, axisSize);
- }
- else
- {
- // Backward iteration
- stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
- }
-
- return stop;
-}
-
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
- _kernel = std::move(k);
-}
-
-void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
- input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
- beginMask, endMask, shrinkAxisMask));
-
- _input = input;
- _output = output;
- _beginData = beginData;
- _endData = endData;
- _stridesData = stridesData;
- _beginMask = beginMask;
- _endMask = endMask;
- _shrinkAxisMask = shrinkAxisMask;
-}
-
-void CLStridedSliceCPU::run()
-{
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
- if (stride > 0)
- {
- return ((stop - start - 1) / stride) + 1;
- }
- else
- {
- return ((stop - start + 1) / stride) + 1;
- }
-}
-
-template <typename T>
-inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
- int32_t endMask, const std::vector<int32_t> &startIndices,
- const std::vector<int32_t> &stopIndices,
- const std::vector<int32_t> &strides, T *outputData)
-{
- ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
-
- const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
- const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
- const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
- const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
- const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
- const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
- const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
- const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
-
- // The shape of outputData may collapse in one-dimension.
- // Therefore, it is necessary to create a shape that matches the result of the outputData.
- TensorShape outputShape(
- getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
- getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
- for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
- in_b += strides[3], b++)
- {
- for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
- in_d += strides[2], d++)
- {
- for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
- in_h += strides[1], h++)
- {
- for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
- in_w += strides[0], w++)
- {
- outputData[offset4D(outputShape, b, d, h, w)] =
- inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
- }
- }
- }
- }
-}
-
-void CLStridedSliceCPU::run_on_cpu()
-{
- // TODO: Support shrinkAxisMask
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
- _beginData->map(q);
- _endData->map(q);
- _stridesData->map(q);
-
- TensorShape inputShape = _input->info()->tensor_shape();
- TensorShape outputShape = _output->info()->tensor_shape();
-
- std::vector<int32_t> starts;
- std::vector<int32_t> stops;
- std::vector<int32_t> strides;
-
- for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx)
- {
- starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
- stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
- strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
- }
-
- for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++)
- {
- starts.emplace_back(0);
- stops.emplace_back(1);
- strides.emplace_back(1);
- }
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- case DataType::QASYMM8:
- StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint8_t *>(_output->buffer()));
- break;
- case DataType::S8:
- case DataType::QS8:
- StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
- break;
- case DataType::U16:
- StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint16_t *>(_output->buffer()));
- break;
- case DataType::S16:
- case DataType::QS16:
- StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int16_t *>(_output->buffer()));
- break;
- case DataType::F16:
- // Not sure this works.
- StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
- break;
- case DataType::U32:
- StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint32_t *>(_output->buffer()));
- break;
- case DataType::S32:
- StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int32_t *>(_output->buffer()));
- break;
- case DataType::F32:
- StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
- break;
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
-
- _input->unmap(q);
- _output->unmap(q);
- _beginData->unmap(q);
- _endData->unmap(q);
- _stridesData->unmap(q);
-}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
new file mode 100644
index 000000000..be7353493
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h"
+
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
+
+using namespace arm_compute;
+
+void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>();
+ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
index 6426364c9..19177497c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -15,12 +15,9 @@
* limitations under the License.
*/
#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-
-#include <vector>
-#include <algorithm>
#include "../../topk_v2.h"
diff --git a/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp
new file mode 100644
index 000000000..988e92715
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/NEON/functions/NENormalizationLayerEx.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NENormalizationLayerEx::NENormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(),
+ _border_handler(), _input_squared()
+{
+}
+
+void NENormalizationLayerEx::configure(const ITensor *input, ITensor *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ _input_squared.allocator()->init(tensor_info);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_squared);
+
+ // Configure kernels
+ _norm_kernel.configure(input, &_input_squared, output, norm_info);
+ _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT,
+ PixelValue(0.0f));
+
+ // Allocate the tensor once the configure methods have been called
+ _input_squared.allocator()->allocate();
+}
+
+Status NENormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NENormalizationLayerExKernel::validate(input, input, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(
+ input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ return Status{};
+}
+
+void NENormalizationLayerEx::run()
+{
+ _memory_group.acquire();
+
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
+}
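For the NEON variant, configure() wires three kernels around one managed intermediate (the squared input), and run() schedules them inside the memory group's acquire/release scope. A minimal usage sketch under stated assumptions; the shape and normalization parameters are illustrative, with NormType and NormalizationLayerInfo coming from the standard arm_compute Types.h:

    #include "arm_compute/runtime/NEON/functions/NENormalizationLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
      Tensor input, output;
      input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

      NENormalizationLayerEx norm;
      // Cross-map normalization over a 5-wide window.
      norm.configure(&input, &output, NormalizationLayerInfo(NormType::CROSS_MAP, 5));

      input.allocator()->allocate();
      output.allocator()->allocate();

      norm.run();
      return 0;
    }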
diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h
index a18ff0b0d..f94effea1 100644
--- a/libs/ARMComputeEx/src/runtime/topk_v2.h
+++ b/libs/ARMComputeEx/src/runtime/topk_v2.h
@@ -15,6 +15,12 @@
* limitations under the License.
*/
+/**
+ * @file topk_v2.h
+ * @brief This file contains the TopK method and the TopContainer class for the TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
@@ -26,34 +32,62 @@ namespace rt
{
namespace optimized_ops
{
-// The follwing codes are impemented and modified while referring to TFLite topk_v2.cc file.
-// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
-// TFLite.
-//(TFLite additionaly supports kTfLiteInt64.)
-
-// The class that collects top indexes of k values. Based on template
-// tensorflow::gtl::TopN<> but, for optimization,
-// it re-uses the same container.
+/**
+ * @brief Class that collects the indexes of the top k values
+ * @note The following code is implemented and modified while referring to the TFLite topk_v2.cc file.
+ * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, and TENSOR_INT32, a narrower
+ * set of types than TFLite.
+ * (TFLite additionally supports kTfLiteInt64.)
+ *
+ * The class that collects top indexes of k values. Based on template
+ * tensorflow::gtl::TopN<> but, for optimization,
+ * it re-uses the same container.
+ */
template <typename T> class TopContainer
{
public:
+ /**
+   * @brief Prevent the default constructor of this class
+ */
TopContainer() = delete;
+ /**
+ * @brief Constructor with params
+   * @param [in] k The number of top predictions to collect
+   * @param [in] row_size Size of a row in the data
+ */
TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
{
container_.reserve(std::min(k, row_size) + 1);
}
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ */
TopContainer(const TopContainer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
+  /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ * @param [in] topContainer To copy
+ * @return Reference of TopContainer
+ */
TopContainer &operator=(const TopContainer &) = delete;
+ /**
+ * @brief Start collecting
+   * @param [in] values Pointer to the row of values to collect from
+ * @return N/A
+ */
void start_collecting(const T *values)
{
values_ = values;
container_.clear();
}
+ /**
+   * @brief Push the index of a value to be considered for the top k
+   * @param [in] a Index of a candidate value
+ * @return N/A
+ */
void push(int32 a)
{
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -74,6 +108,10 @@ public:
}
}
+ /**
+ * @brief Get sorted result from pushed values
+ * @return Reference of vector with sorted values
+ */
const std::vector<int32> &sorted_result()
{
auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
@@ -111,6 +149,16 @@ private:
}
};
+/**
+ * @brief Perform the TopK operation with the given params
+ * @param [in] row_size Size of a row in the data
+ * @param [in] num_rows The number of rows in the data
+ * @param [in] data Input data to select the top k values from
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
T *output_values)