60 files changed, 0 insertions, 8763 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
deleted file mode 100644
index 05ecdeb22..000000000
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <utility>
-#include <vector>
-
-using namespace arm_compute;
-
-const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
-    // ARMComputeEx kernels
-    {"activation_layer_ex", "activation_layer_ex.cl"},
-    {"arg_op", "arg_operation.cl"},
-    {"arithmetic_sub_ex", "arithmetic_op_ex.cl"},
-    {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
-    {"batch_to_space_nd", "batch_to_space_nd.cl"},
-    {"binary_logical_op", "binary_logical_op.cl"},
-    {"cast", "cast.cl"},
-    {"cast_qasymm_in", "cast.cl"},
-    {"cast_qasymm_out", "cast.cl"},
-    {"comparison_op", "comparison_op.cl"},
-    {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
-    {"depth_to_space", "depth_to_space.cl"},
-    {"embedding_lookup", "embedding_lookup.cl"},
-    {"exp_layer", "exp.cl"},
-    {"gather", "gather.cl"},
-    {"gather_1d", "gather.cl"},
-    {"gather_1d_out", "gather.cl"},
-    {"hashtable_lookup", "hashtable_lookup.cl"},
-    {"neg_tensor", "neg_tensor.cl"},
-    {"pad", "pad.cl"},
-    {"permute_generic", "permute_ex.cl"},
-    {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
-    {"pixelwise_div_float", "pixelwise_div_float.cl"},
-    {"pixelwise_div_int", "pixelwise_div_int.cl"},
-    {"prelu", "prelu.cl"},
-    {"prelu_qasymm8", "prelu_quantized.cl"},
-    {"reduce_min_max", "reduce_operation.cl"},
-    {"reduce_sum_mean", "reduce_operation.cl"},
-    {"squared_difference", "squared_difference.cl"},
-    {"strided_slice_ex", "strided_slice_ex.cl"},
-    {"topkv2_init", "topkv2.cl"},
-    {"topkv2_find_first_negative", "topkv2.cl"},
-    {"topkv2_reorder_negatives", "topkv2.cl"},
-    {"topkv2_store", "topkv2.cl"},
-    {"radixsort_histogram", "topkv2_radixsort.cl"},
-    {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
-    {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
-    {"radixsort_reorder", "topkv2_radixsort.cl"},
-    {"topkv2_quicksort", "topkv2_quicksort.cl"},
-    {"space_to_batch_4d_nchw", "space_to_batch.cl"},
-    {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
-    {"space_to_depth", "space_to_depth.cl"},
-};
-
-const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
-#ifdef EMBEDDED_KERNELS
-    {
-        "activation_layer_ex.cl",
-#include "./cl_kernels/activation_layer_ex.clembed"
-    },
-    {
-        "arg_operation.cl",
-#include "./cl_kernels/arg_operation.clembed"
-    },
-    {
-        "arithmetic_op_ex.cl",
-#include "./cl_kernels/arithmetic_op_ex.clembed"
-    },
-    {
-        "batch_to_space_nd.cl",
-#include "./cl_kernels/batch_to_space_nd.clembed"
-    },
-    {
-        "cast.cl",
-#include "./cl_kernels/cast.clembed"
-    },
-    {
-        "comparison_op.cl",
-#include "./cl_kernels/comparison_op.clembed"
-    },
-    {
-        "comparison_op_quantized.cl",
-#include "./cl_kernels/comparison_op_quantized.clembed"
-    },
-    {
-        "embedding_lookup.cl",
-#include "./cl_kernels/embedding_lookup.clembed"
-    },
-    {
-        "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
-    },
-    {
-        "exp.cl",
-#include "./cl_kernels/exp.clembed"
-    },
-    {
-        "gather.cl",
-#include "./cl_kernels/gather.clembed"
-    },
-    {
-        "hashtable_lookup.cl",
-#include "./cl_kernels/hashtable_lookup.clembed"
-    },
-    {
-        "helpers.h",
-#include "./cl_kernels/helpers.hembed"
-    },
-    {
-        "helpers_asymm.h",
-#include "./cl_kernels/helpers_asymm.hembed"
-    },
-    {
-        "binary_logical_op.cl",
-#include "./cl_kernels/binary_logical_op.clembed"
-    },
-    {
-        "neg_tensor.cl",
-#include "./cl_kernels/neg_tensor.clembed"
-    },
-    {
-        "pad.cl",
-#include "./cl_kernels/pad.clembed"
-    },
-    {
-        "pixelwise_div_float.cl",
-#include "./cl_kernels/pixelwise_div_float.clembed"
-    },
-    {
-        "pixelwise_div_int.cl",
-#include "./cl_kernels/pixelwise_div_int.clembed"
-    },
-    {
-        "prelu.cl",
-#include "./cl_kernels/prelu.clembed"
-    },
-    {
-        "prelu_quantized.cl",
-#include "./cl_kernels/prelu_quantized.clembed"
-    },
-    {
-        "reduce_operation.cl",
-#include "./cl_kernels/reduce_operation.clembed"
-    },
-    {
-        "space_to_batch.cl",
-#include "./cl_kernels/space_to_batch.clembed"
-    },
-    {
-        "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
-    },
-    {
-        "squared_difference.cl",
-#include "./cl_kernels/squared_difference.clembed"
-    },
-    {
-        "strided_slice_ex.cl",
-#include "./cl_kernels/strided_slice_ex.clembed"
-    },
-    {
-        "topkv2.cl",
-#include "./cl_kernels/topkv2.clembed"
-    },
-    {
-        "topkv2_radixsort.cl",
-#include "./cl_kernels/topkv2_radixsort.clembed"
-    },
-    {
-        "topkv2_quicksort.cl",
-#include "./cl_kernels/topkv2_quicksort.clembed"
-    },
-    {
-        "permute_ex.cl",
-#include "./cl_kernels/permute_ex.clembed"
-    },
-
-#endif /* EMBEDDED_KERNELS */
-};
-
-CLKernelLibraryEx::CLKernelLibraryEx()
-    : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
-{
-  opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
-                         // CLKernelLibraryEx is built
-}
-
-CLKernelLibraryEx &CLKernelLibraryEx::get()
-{
-  static CLKernelLibraryEx _kernel_library;
-  return _kernel_library;
-}
-
-Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
-                                        const StringSet &build_options_set) const
-{
-  // Find which program contains the kernel
-  auto kernel_program_it = _kernel_program_map.find(kernel_name);
-
-  if (_kernel_program_map.end() == kernel_program_it)
-  {
-    ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
-  }
-  std::string concat_str;
-
-  if (fp16_supported())
-  {
-    concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
-  }
-
-  if (get_cl_version(_device) == CLVersion::CL20)
-  {
-    concat_str += " -cl-std=CL2.0 ";
-  }
-  else if (arm_non_uniform_workgroup_supported(_device))
-  {
-    concat_str += " -cl-arm-non-uniform-work-group-size ";
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
-  }
-
-  // Check if the program has been built before with same build options.
-  const std::string program_name = kernel_program_it->second;
-  const std::string build_options = stringify_set(build_options_set) + concat_str;
-
-  const std::string built_program_name = program_name + "_" + build_options;
-  auto built_program_it = _built_programs_map.find(built_program_name);
-
-  cl::Program cl_program;
-
-  if (_built_programs_map.end() != built_program_it)
-  {
-    // If program has been built, retrieve to create kernel from it
-    cl_program = built_program_it->second;
-  }
-  else
-  {
-    // Get program
-    Program program = load_program(program_name);
-
-    // Build program
-    cl_program = program.build(build_options);
-
-    // Add built program to internal map
-    _built_programs_map.emplace(built_program_name, cl_program);
-  }
-
-  // Create and return kernel
-  return Kernel(kernel_name, cl_program);
-}
-
-void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
-                                          cl::Program program)
-{
-  _built_programs_map.emplace(built_program_name, program);
-}
-
-bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); }
-
-bool CLKernelLibraryEx::int64_base_atomics_supported() const
-{
-  return device_supports_extension(_device, "cl_khr_int64_base_atomics");
-}
-
-const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
-{
-  const auto program_it = _programs_map.find(program_name);
-
-  if (program_it != _programs_map.end())
-  {
-    return program_it->second;
-  }
-
-  Program program;
-
-#ifdef EMBEDDED_KERNELS
-  const auto program_source_it = _program_source_map.find(program_name);
-
-  if (_program_source_map.end() == program_source_it)
-  {
-    ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
-  }
-
-  program = Program(_context, program_name, program_source_it->second);
-#else  /* EMBEDDED_KERNELS */
-  // Check for binary
-  std::string source_name = _kernel_path + program_name;
-  std::string binary_name = source_name + "bin";
-
-  if (std::ifstream(binary_name).is_open())
-  {
-    const std::string program_binary = read_file(binary_name, true);
-    program = Program(_context, _device, program_name,
-                      std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
-  }
-  else if (std::ifstream(source_name).is_open())
-  {
-    program = Program(_context, program_name, read_file(source_name, false));
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
-  }
-#endif /* EMBEDDED_KERNELS */
-
-  // Insert program to program map
-  const auto new_program = _programs_map.emplace(program_name, std::move(program));
-
-  return new_program.first->second;
-}
-
-std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
-{
-  std::string concat_set;
-
-#ifndef EMBEDDED_KERNELS
-  concat_set += "-I" + _kernel_path + " ";
-#endif /* EMBEDDED_KERNELS */
-
-  // Concatenate set
-  for (const auto &el : s)
-  {
-    concat_set += " " + el;
-  }
-
-  return concat_set;
-}
-
-std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
-{
-  const auto program_source_it = _program_source_map.find(program_name);
-
-  if (program_source_it == _program_source_map.end())
-  {
-    ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
-  }
-
-  return program_source_it->second;
-}
-
-size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
-{
-  size_t result;
-
-  size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
-  ARM_COMPUTE_ERROR_ON_MSG(
-      err != 0,
-      "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
-  ARM_COMPUTE_UNUSED(err);
-
-  return result;
-}
-
-cl::NDRange CLKernelLibraryEx::default_ndrange() const
-{
-  //    GPUTarget   _target = get_target_from_device(_device);
-  cl::Device device = cl::Device::getDefault();
-  GPUTarget _target = get_target_from_device(device);
-  cl::NDRange default_range;
-
-  switch (_target)
-  {
-    case GPUTarget::MIDGARD:
-    case GPUTarget::T600:
-    case GPUTarget::T700:
-    case GPUTarget::T800:
-      default_range = cl::NDRange(128u, 1);
-      break;
-    default:
-      default_range = cl::NullRange;
-  }
-
-  return default_range;
-}
-
-std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }
diff --git a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp b/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
deleted file mode 100644
index cbda169fb..000000000
--- a/libs/ARMComputeEx/src/core/CL/OpenCLEx.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/OpenCLEx.h"
-
-#include <dlfcn.h>
-#include <iostream>
-
-namespace arm_compute
-{
-CLSymbolsEx &CLSymbolsEx::get()
-{
-  static CLSymbolsEx symbols;
-  return symbols;
-}
-
-bool CLSymbolsEx::load_default()
-{
-  static const std::vector<std::string> libraries{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
-
-  if (_loaded.first)
-  {
-    return _loaded.second;
-  }
-
-  // Indicate that default loading has been tried
-  _loaded.first = true;
-
-  for (const auto &lib : libraries)
-  {
-    if (load(lib))
-    {
-      return true;
-    }
-  }
-
-  std::cerr << "Couldn't find any OpenCL library.\n";
-  return false;
-}
-
-bool CLSymbolsEx::load(const std::string &library)
-{
-  void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
-
-  if (handle == nullptr)
-  {
-    std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
-    // Set status of loading to failed
-    _loaded.second = false;
-    return false;
-  }
-
-#define LOAD_FUNCTION_PTR(func_name, handle) \
-  func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
-
-  LOAD_FUNCTION_PTR(clGetEventInfo, handle);
-  LOAD_FUNCTION_PTR(clSetEventCallback, handle);
-
-#undef LOAD_FUNCTION_PTR
-
-  // Don't call dlclose(handle) or all the symbols will be unloaded !
-
-  // Disable default loading and set status to successful
-  _loaded = std::make_pair(true, true);
-
-  return true;
-}
-
-} // namespace arm_compute
-
-cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size,
-                      void *param_value, size_t *param_value_size_ret)
-{
-  arm_compute::CLSymbolsEx::get().load_default();
-  auto func = arm_compute::CLSymbolsEx::get().clGetEventInfo_ptr;
-  if (func != nullptr)
-  {
-    return func(event, param_name, param_value_size, param_value, param_value_size_ret);
-  }
-  else
-  {
-    return CL_OUT_OF_RESOURCES;
-  }
-}
-
-cl_int clSetEventCallback(cl_event event, cl_int command_exec_callback_type,
-                          void(CL_CALLBACK *pfn_ev_notify)(cl_event ev, cl_int ev_cmd_exec_status,
-                                                           void *user_data),
-                          void *user_data)
-{
-  arm_compute::CLSymbolsEx::get().load_default();
-  auto func = arm_compute::CLSymbolsEx::get().clSetEventCallback_ptr;
-  if (func != nullptr)
-  {
-    return func(event, command_exec_callback_type, pfn_ev_notify, user_data);
-  }
-  else
-  {
-    return CL_OUT_OF_RESOURCES;
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
deleted file mode 100644
index f54c7bde3..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#define CONST_ONE 1.f
-#define DIV_OP(a, b) ((a) / (b))
-#define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a)))
-
-// Inverse Square-root Activation
-inline TYPE rsqrt_op(TYPE x)
-{
-    return RSQRT_OP(x);
-}
-
-#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
-
-#if defined(ACT)
-
-/** This performs an activation function floating point inputs.
- *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void activation_layer_ex(
-    TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
-    ,
-    TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
-    // Get pixels pointer
-    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
-    Tensor3D output = input;
-#else  /* IN_PLACE */
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
-    // Load data
-    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
-
-    // Perform activation
-    data = ACTIVATION_OP(ACT, data);
-
-    // Store result
-    VSTORE(VEC_SIZE)
-    (data, 0, (__global DATA_TYPE *)output.ptr);
-}
-
-#endif /* defined(ACT) */
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 9a6921d7c..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U32
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  axis                                 Axis through which reduction occurs for max value index
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input),
-                     TENSOR4D_DECLARATION(output),
-		     const int axis,
-		     const int dim)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
-
-    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    DATA_TYPE tval = value;
-    int idx = 0;
-    for(int i = 1; i < dim; ++i)
-    {
-      indices[axis] = i;
-
-      #if OP_CODE == 1 // ArgMax
-        value = max(value, *((__global DATA_TYPE *)
-		             tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-      #elif OP_CODE == 2 //ArgMin
-        value = min(value, *((__global DATA_TYPE *)
-		    	     tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-      #else
-	return;
-
-      #endif
-
-      if(tval!=value)
-      {
-        idx = indices[axis];
-        tval = value;
-      }
-    }
-
-    *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
deleted file mode 100644
index 2ed698951..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_ex.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** This function subtracts one tensors from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8, S16
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8, S16
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8, S16
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_sub_ex(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
-    // Calculate and store result
-    vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index 5cd0a4309..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * The following computations will be performed:
- *
- *  -# Add offset terms to inputs
-    -# Get scaled value of two inputs
- *  -# Add inputs
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_add_qasymm8(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load data
-    VEC_DATA_TYPE(int, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-    VEC_DATA_TYPE(int, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
-    // Get scaled value of two inputs
-    VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-    VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
-    VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
-    VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
-    VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
-    VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
-    VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
-    // Add inputs and multiply with a multiplier smaller than 1
-    VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
-    VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-    out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
-    VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
-
-    // Store result
-    VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
-                     0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
deleted file mode 100644
index ad6a48a02..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/batch_to_space_nd.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
-/** Perform batch to space rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor batch should be given as a preprocessor argument using -DBATCH_OUT=size. e.g. -DBATCH_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE0=size. e.g. -DBLOCK_SIZE0=1
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nd(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int out_index[4]={0};
-    int in_index[4]={0};
-
-    out_index[0] = get_global_id(0);//W
-    out_index[1] = get_global_id(1);//H
-    out_index[2] = get_global_id(2) % DEPTH_OUT;//C
-    out_index[3] = get_global_id(2) / DEPTH_OUT;//N
-
-    in_index[0] = out_index[0]/BLOCK_SIZE1;
-    in_index[1] = out_index[1]/BLOCK_SIZE0;
-    in_index[2] = out_index[2];
-    in_index[3] = out_index[3] + ((out_index[1] % BLOCK_SIZE0) * BLOCK_SIZE0 + out_index[0] % BLOCK_SIZE1) * BATCH_OUT;
-
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE0) && defined(BLOCK_SIZE1) && defined(BATCH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
deleted file mode 100644
index bea61f53e..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(OP_CODE) && defined(DATA_TYPE)
-/** returns truth value of the two input tensors for BINARY LOGICAL OP.
- * where BINARY LOGICAL OP can be AND, OR.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[in]  input2_ptr                            Pointer to the source tensor.Supported data types: QASYMM8
- * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- */
-__kernel void binary_logical_op(
-    TENSOR3D_DECLARATION(input1),
-    TENSOR3D_DECLARATION(input2),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
-    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    #if OP_CODE == 1 // LOGICAL AND
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-	     (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
-
-    #elif OP_CODE == 2 // LOGICAL OR
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-	     (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
-
-    #else // OP NOT SUPPORTED
-     return
-
-    #endif
-}
-#endif //if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 3d4675e5d..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE    1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET   0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
-                             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_qasymm_in(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
-        VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
-    VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
-    VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
-    VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-
-/** Perform a cast operation on an QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_qasymm_out(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data =
-        VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-    VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
-    VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
-    VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
-    VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
-    VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
-                     0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
deleted file mode 100644
index 765072556..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
-/** Returns truth value of comparison operators.
- * Comparison operators may be equal, not_equal etc.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN, -DDATA_TYPE_OUT,
- * e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT = uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void comparison_op(
-    TENSOR3D_DECLARATION(input1),
-    TENSOR3D_DECLARATION(input2),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
-    Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    #if OP_CODE == 1 //EQUAL
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-             (0, (__global DATA_TYPE_IN *)input1.ptr) == VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),0, (__global DATA_TYPE_OUT *)output.ptr);
-
-    #elif OP_CODE == 2 //NOT_EQUAL
-     VSTORE(VEC_SIZE)
-     (CONVERT(VLOAD(VEC_SIZE)
-             (0, (__global DATA_TYPE_IN *)input1.ptr) != VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input2.ptr),
-             VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
-
-    #else // OP NOT SUPPORTED
-     return;
-
-    #endif
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
deleted file mode 100644
index 1eb305f7b..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#ifndef VEC_SIZE
-#define VEC_SIZE  1
-#endif
-
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-
-/** Returns the truth value of comparison .
- * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void comparison_op_qasymm8(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
-    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
-
-    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
-    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
-    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
-    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-
-    #if OPCODE == 1 //EQUAL QUANTIZED
-      VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
-
-    #elif OPCODE == 2 //NOT EQUAL QUANTIZED
-      VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
-
-    #else // OP NOT SUPPORTED
-      return;
-
-    #endif
-}
-#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(DATA_TYPE_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index fef2243e7..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void depth_to_space(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int out_index[4]={0};
-    int in_index[4]={0};
-
-    out_index[0] = get_global_id(0);//W
-    out_index[1] = get_global_id(1);//H
-    out_index[2] = get_global_id(2) % DEPTH_OUT;//C
-    out_index[3] = get_global_id(2) / DEPTH_OUT;//B
-
-    in_index[0] = out_index[0]/BLOCK_SIZE;
-    in_index[1] = out_index[1]/BLOCK_SIZE;
-    in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
-    in_index[3] = out_index[3];
-
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
deleted file mode 100644
index 348458fe9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Perform embedding_lookup of input tensor
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-
-__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
-                               TENSOR4D_DECLARATION(output),
-                               VECTOR_DECLARATION(lookups))
-{
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-
-    //lookup ids for based on the tensor dimensions
-    int lup_id[4] = {0};
-
-    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
-	                       :get_global_id(0);
-    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
-	                       :get_global_id(1);
-    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
-	                       :get_global_id(2)%DEPTH_OUT;
-    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
-                               :get_global_id(2) / DEPTH_OUT;
-
-    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
-              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
deleted file mode 100644
index 69d94f30a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Perform an exponential operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void exp_layer(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VSTORE(VEC_SIZE)
-    (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
deleted file mode 100644
index 6b767d6c9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-/** Perform gather
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in]  input1_ptr                            Pointer to the first source tensor. Supported data types: U8/S32/F32
- * @param[in]  input1_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input1_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[in]  input2_ptr                            Pointer to the first source tensor. Supported data types: U32
- * @param[in]  input2_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input2_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gather(IMAGE_DECLARATION(input1),
-                    VECTOR_DECLARATION(input2),
-                    IMAGE_DECLARATION(output))
-{
-    Image in1  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
-    Vector in2  = CONVERT_TO_VECTOR_STRUCT(input2);
-    Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output);
-
-    VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
-    in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
-    //TODO: performance tuning for memcopy
-    int index = in2_data.s0;
-    int stride=input1_stride_y/input1_stride_x;
-
-    for(int i=0; i<stride; i++){
-        *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index));
-    }
-}
-
-__kernel void gather_1d_out(IMAGE_DECLARATION(input1),
-                    VECTOR_DECLARATION(input2),
-                    VECTOR_DECLARATION(output))
-{
-    Image in1  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1);
-    Vector in2  = CONVERT_TO_VECTOR_STRUCT(input2);
-    Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
-
-    VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
-    in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
-    //TODO: performance tuning for memcopy
-    int index = in2_data.s0;
-    int stride=input1_stride_y/input1_stride_x;
-
-    for(int i=0; i<stride; i++){
-        *((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index));
-    }
-}
-
-__kernel void gather_1d(VECTOR_DECLARATION(input1),
-                    VECTOR_DECLARATION(input2),
-                    VECTOR_DECLARATION(output))
-{
-    Vector in1  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1);
-    Vector in2  = CONVERT_TO_VECTOR_STRUCT(input2);
-    Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output);
-
-    VEC_DATA_TYPE(DATA_TYPE_IN2, 2)
-    in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2));
-
-    //TODO: performance tuning for memcopy
-    int index = in2_data.s0;
-    *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
deleted file mode 100644
index ed7409852..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
-/** Perform hashtable_lookup of input tensor
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
- * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                          output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
-                               TENSOR4D_DECLARATION(output),
-                               VECTOR_DECLARATION(lookups))
-{
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-
-    int lup_id[4] = {0};
-
-    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
-	                       :get_global_id(0);
-    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
-	                       :get_global_id(1);
-    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
-	                       :get_global_id(2)%DEPTH_OUT;
-    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
-                               :get_global_id(2) / DEPTH_OUT;
-
-    if (lup_id[NUM_DIMS-1] < 0)
-    {
-      VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
-      return;
-    }
-
-    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
-              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
deleted file mode 100644
index 0e123ae0a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
-    defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
-       // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
-
-#define EXPAND(x) x
-
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-
-#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
-
-#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
-
-#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CL_VEC_DATA_TYPE_STR(type, size) type##size
-#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
-
-#define VECTOR_DECLARATION(name)                                        \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
-      uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name)                                                               \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name)                                                            \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_stride_z, uint name##_step_z,                           \
-      uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name)                                                            \
-  __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
-      uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w,     \
-      uint name##_step_w, uint name##_offset_first_element_in_bytes
-
-#define CONVERT_TO_VECTOR_STRUCT(name)                                                          \
-  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                             name##_step_x)
-
-#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
-  update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-
-#define CONVERT_TO_IMAGE_STRUCT(name)                                                          \
-  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                            name##_step_x, name##_stride_y, name##_step_y)
-
-#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)                                                     \
-  update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
-                            name##_stride_y, 0)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                              \
-  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
-                                          name##_stride_x, name##_step_x, name##_stride_y,  \
-                                          name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)                                             \
-  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes,        \
-                                          name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
-                                          name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                              \
-  update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
-                                          name##_stride_x, name##_step_x, name##_stride_y,  \
-                                          name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                          \
-  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
-                               name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                                  \
-  update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               0, name##_stride_y, 0, name##_stride_z, 0)
-
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                \
-  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
-                               name##_step_z, name##_stride_w, name##_step_w, mod_size)
-
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                                        \
-  update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
-                               0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0,     \
-                               mod_size)
-
-/** Structure to hold Vector information */
-typedef struct Vector
-{
-  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
-  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
-} Vector;
-
-/** Structure to hold Image information */
-typedef struct Image
-{
-  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
-  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
-} Image;
-
-/** Structure to hold 3D tensor information */
-typedef struct Tensor3D
-{
-  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
-  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
-  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
-} Tensor3D;
-
-/** Structure to hold 4D tensor information */
-typedef struct Tensor4D
-{
-  __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
-  int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
-  int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
-  int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
-  int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
-  int stride_w;                      /**< Stride of the image in W dimension (in bytes) */
-} Tensor4D;
-
-/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
- * data.
- *
- * @param[in] ptr                           Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
- * @param[in] step_x                        stride_x * number of elements along X processed per
- * workitem(in bytes)
- *
- * @return An image object
- */
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
-                                         uint stride_x, uint step_x)
-{
-  Vector vector = {
-      .ptr = ptr,
-      .offset_first_element_in_bytes = offset_first_element_in_bytes,
-      .stride_x = stride_x,
-  };
-  vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
-  return vector;
-}
-
-/** Wrap image information into an Image structure, and make the pointer point at this workitem's
- * data.
- *
- * @param[in] ptr                           Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
- * @param[in] step_x                        stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
- * @param[in] step_y                        stride_y * number of elements along Y processed per
- * workitem(in bytes)
- *
- * @return An image object
- */
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
-                                       uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
-  Image img = {.ptr = ptr,
-               .offset_first_element_in_bytes = offset_first_element_in_bytes,
-               .stride_x = stride_x,
-               .stride_y = stride_y};
-  img.ptr +=
-      img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
-  return img;
-}
-
-/** Wrap 3D tensor information into an image structure, and make the pointer point at this
- * workitem's data.
- *
- * @param[in] ptr                           Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
- * @param[in] step_x                        stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
- * @param[in] step_y                        stride_y * number of elements along Y processed per
- * workitem(in bytes)
- * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
- * @param[in] step_z                        stride_z * number of elements along Z processed per
- * workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
-                                                     uint offset_first_element_in_bytes,
-                                                     uint stride_x, uint step_x, uint stride_y,
-                                                     uint step_y, uint stride_z, uint step_z)
-{
-  Image img = {.ptr = ptr,
-               .offset_first_element_in_bytes = offset_first_element_in_bytes,
-               .stride_x = stride_x,
-               .stride_y = stride_y};
-  img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
-             get_global_id(1) * step_y + get_global_id(2) * step_z;
-  return img;
-}
-
-/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
- * workitem's data.
- *
- * @param[in] ptr                           Pointer to the starting postion of the buffer
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
- * @param[in] step_x                        stride_x * number of elements along X processed per
- * workitem(in bytes)
- * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
- * @param[in] step_y                        stride_y * number of elements along Y processed per
- * workitem(in bytes)
- * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
- * @param[in] step_z                        stride_z * number of elements along Z processed per
- * workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
-                                             uint offset_first_element_in_bytes, uint stride_x,
-                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
-                                             uint step_z)
-{
-  Tensor3D tensor = {.ptr = ptr,
-                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
-                     .stride_x = stride_x,
-                     .stride_y = stride_y,
-                     .stride_z = stride_z};
-  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
-                get_global_id(1) * step_y + get_global_id(2) * step_z;
-  return tensor;
-}
-
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
-                                             uint offset_first_element_in_bytes, uint stride_x,
-                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
-                                             uint step_z, uint stride_w, uint step_w, uint mod_size)
-{
-  Tensor4D tensor = {.ptr = ptr,
-                     .offset_first_element_in_bytes = offset_first_element_in_bytes,
-                     .stride_x = stride_x,
-                     .stride_y = stride_y,
-                     .stride_z = stride_z,
-                     .stride_w = stride_w};
-
-  tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
-                get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
-                (get_global_id(2) / mod_size) * step_w;
-  return tensor;
-}
-
-/** Get the pointer position of a Vector
- *
- * @param[in] vec Pointer to the starting position of the buffer
- * @param[in] x   Relative X position
- */
-inline __global const uchar *vector_offset(const Vector *vec, int x)
-{
-  return vec->ptr + x * vec->stride_x;
-}
-
-/** Get the pointer position of a Image
- *
- * @param[in] img Pointer to the starting position of the buffer
- * @param[in] x   Relative X position
- * @param[in] y   Relative Y position
- */
-inline __global uchar *offset(const Image *img, int x, int y)
-{
-  return img->ptr + x * img->stride_x + y * img->stride_y;
-}
-
-/** Get the pointer position of a Tensor3D
- *
- * @param[in] tensor Pointer to the starting position of the buffer
- * @param[in] x      Relative X position
- * @param[in] y      Relative Y position
- * @param[in] z      Relative Z position
- */
-inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
-{
-  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
-}
-
-/** Get the pointer position of a Tensor4D
- *
- * @param[in] tensor Pointer to the starting position of the buffer
- * @param[in] x      Relative X position
- * @param[in] y      Relative Y position
- * @param[in] z      Relative Z position
- * @param[in] w      Relative W position
- */
-inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
-{
-  return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
-         w * tensor->stride_w;
-}
-
-#endif // _HELPER_H
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
deleted file mode 100644
index c39138caa..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
-#define ARM_COMPUTE_HELPERS_ASYMM_H
-
-#include "helpers.h"
-
-/** Correctly-rounded-to-nearest division by a power-of-two.
- *
- * @param[in] size Size of vector.
- *
- * @return Correctly-rounded-to-nearest division by a power-of-two.
- */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                     \
-  inline VEC_DATA_TYPE(int, size)                                                    \
-      asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-  {                                                                                  \
-    VEC_DATA_TYPE(int, size)                                                         \
-    mask = (1 << exponent) - 1;                                                      \
-    const VEC_DATA_TYPE(int, size) zero = 0;                                         \
-    const VEC_DATA_TYPE(int, size) one = 1;                                          \
-    VEC_DATA_TYPE(int, size)                                                         \
-    threshold = (mask >> 1) + select(zero, one, x < 0);                              \
-    return (x >> exponent) + select(zero, one, (x & mask) > threshold);              \
-  }
-
-/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
- * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
- *
- * @param[in] size Size of vector.
- *
- * @return Product of two fixed-point numbers.
- */
-#define ASYMM_MULT_IMPL(size)                                                  \
-  inline VEC_DATA_TYPE(int, size)                                              \
-      asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  {                                                                            \
-    VEC_DATA_TYPE(int, size)                                                   \
-    overflow = a == b && a == INT_MIN;                                         \
-    VEC_DATA_TYPE(long, size)                                                  \
-    a_64 = convert_long##size(a);                                              \
-    VEC_DATA_TYPE(long, size)                                                  \
-    b_64 = convert_long##size(b);                                              \
-    VEC_DATA_TYPE(long, size)                                                  \
-    ab_64 = a_64 * b_64;                                                       \
-    /* COMPMID-907 */                                                          \
-    VEC_DATA_TYPE(int, size)                                                   \
-    ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));             \
-    return select(ab_x2_high32, INT_MAX, overflow);                            \
-  }
-
-/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                   \
-  inline VEC_DATA_TYPE(int, size)                                                                  \
-      asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
-                                                                              a)                   \
-  {                                                                                                \
-    const VEC_DATA_TYPE(int, size) constant_term = 1895147668;                                     \
-    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                  \
-    const int k_fractional_bits = 31;                                                              \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x = a + (1 << (k_fractional_bits - 3));                                                        \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x2 = ASYMM_MULT(x, x, size);                                                                   \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x3 = ASYMM_MULT(x2, x, size);                                                                  \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4 = ASYMM_MULT(x2, x2, size);                                                                 \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                        \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_24_plus_x3_over_6_plus_x2 =                                                            \
-        ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                                \
-    VEC_DATA_TYPE(int, size)                                                                       \
-    x4_over_24_plus_x3_over_6_plus_x2_over_2 =                                                     \
-        ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);                 \
-    return constant_term +                                                                         \
-           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);          \
-  }
-
-/** Each bit of the result is set to the corresponding bit of either then_val or
- * else_val depending on whether the corresponding bit of if_mask is set.
- * Equivalent to the VBSL instruction in ARM NEON.
- *
- * @param[in] size Size of vector.
- *
- * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding
- * bit in @p if_mask is set or not.
- */
-#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                         \
-  inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask,  \
-                                                                VEC_DATA_TYPE(int, size) then_val, \
-                                                                VEC_DATA_TYPE(int, size) else_val) \
-  {                                                                                                \
-    return (if_mask & then_val) ^ (~if_mask & else_val);                                           \
-  }
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is zero.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is zero.
- */
-#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                  \
-  inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
-  {                                                                                    \
-    const VEC_DATA_TYPE(int, size) all_zeros = 0;                                      \
-    const VEC_DATA_TYPE(int, size) all_ones = ~0;                                      \
-    return select(all_zeros, all_ones, a == 0);                                        \
-  }
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is non-zero.
- *
- * @param[in] size Size of vector.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is non zero.
- */
-#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                  \
-  inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
-  {                                                                                        \
-    const VEC_DATA_TYPE(int, size) all_zeros = 0;                                          \
-    const VEC_DATA_TYPE(int, size) all_ones = ~0;                                          \
-    return select(all_zeros, all_ones, a != 0);                                            \
-  }
-
-#define EXP_BARREL_SHIFTER_IMPL(size)                                                          \
-  inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(                                    \
-      VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits,    \
-      int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder)                               \
-  {                                                                                            \
-    if (k_integer_bits > exponent)                                                             \
-    {                                                                                          \
-      const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
-      return ASYMM_SELECT_USING_MASK(                                                          \
-          ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                     \
-          ASYMM_MULT(result, fp_multiplier, size), result, size);                              \
-    }                                                                                          \
-                                                                                               \
-    return result;                                                                             \
-  }
-
-/** Calculates \f$ exp(x) \f$ for x < 0.
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                   \
-  inline VEC_DATA_TYPE(int, size)                                                                 \
-      asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)          \
-  {                                                                                               \
-    const int k_fractional_bits = 31 - k_integer_bits;                                            \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    k_one_quarter = 1 << (k_fractional_bits - 2);                                                 \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    mask = k_one_quarter - 1;                                                                     \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                 \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;   \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(                       \
-        a_mod_quarter_minus_one_quarter_scaled, size);                                            \
-    VEC_DATA_TYPE(int, size)                                                                      \
-    remainder = a_mod_quarter_minus_one_quarter - a;                                              \
-                                                                                                  \
-    result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits,        \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits,        \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits,         \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits,         \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits,          \
-                                remainder, size);                                                 \
-    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
-                                size);                                                            \
-    result =                                                                                      \
-        EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);  \
-                                                                                                  \
-    if (k_integer_bits > 5)                                                                       \
-    {                                                                                             \
-      const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                     \
-      result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
-    }                                                                                             \
-                                                                                                  \
-    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                              \
-    return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);            \
-  }
-
-/** Calculates the product of a integer value by a power of two, with either a positive exponent
- * (equivalent to an arithmetic left shift, saturating) or a negative exponent
- * (equivalent to an arithmetic right shift, rounding to nearest).
- *
- * @param[in] size Size of vector.
- *
- * @return Arithmetic left or right shift.
- */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                    \
-  inline VEC_DATA_TYPE(int, size)                                                            \
-      asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-  {                                                                                          \
-    if (exponent < 0)                                                                        \
-    {                                                                                        \
-      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                              \
-    }                                                                                        \
-                                                                                             \
-    const VEC_DATA_TYPE(int, size) min = INT_MIN;                                            \
-    const VEC_DATA_TYPE(int, size) max = INT_MAX;                                            \
-    int threshold = ((1 << (31 - exponent)) - 1);                                            \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                             \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                            \
-    VEC_DATA_TYPE(int, size)                                                                 \
-    result = x << exponent;                                                                  \
-    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                      \
-    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                      \
-    return result;                                                                           \
-  }
-
-/** Calculates (a+b)/2, rounded to the nearest integer.
- * Equivalent to VRHADD in the ARM NEON instruction set.
- *
- * @param[in] size Size of vector.
- *
- * @return (a+b)/2, rounded to the nearest integer.
- */
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                  \
-  inline VEC_DATA_TYPE(int, size)                                                           \
-      asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-  {                                                                                         \
-    VEC_DATA_TYPE(long, size)                                                               \
-    a64 = convert_long##size(a);                                                            \
-    VEC_DATA_TYPE(long, size)                                                               \
-    b64 = convert_long##size(b);                                                            \
-    VEC_DATA_TYPE(long, size)                                                               \
-    sum = a64 + b64;                                                                        \
-    const VEC_DATA_TYPE(long, size) one = 1;                                                \
-    const VEC_DATA_TYPE(long, size) minus_one = -1;                                         \
-    VEC_DATA_TYPE(long, size)                                                               \
-    sign = select(minus_one, one, sum >= 0);                                                \
-    return convert_int##size((sum + sign) / 2);                                             \
-  }
-
-/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
- *
- * @param[in] size Size of vector.
- *
- * @return Result in fixed-point format Q0.
- */
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                      \
-  inline VEC_DATA_TYPE(int, size)                                              \
-      asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
-  {                                                                            \
-    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                           \
-    const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                     \
-    VEC_DATA_TYPE(int, size)                                                   \
-    half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);               \
-    const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810;                 \
-    const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;            \
-    VEC_DATA_TYPE(int, size)                                                   \
-    x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
-    for (int i = 0; i < 3; i++)                                                \
-    {                                                                          \
-      VEC_DATA_TYPE(int, size)                                                 \
-      half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);        \
-      VEC_DATA_TYPE(int, size)                                                 \
-      one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;  \
-      VEC_DATA_TYPE(int, size)                                                 \
-      tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);           \
-      x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);            \
-    }                                                                          \
-    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                 \
-  }
-
-/** Considering the integer value as fixed-point, change the number of integer bits and update value
- * accordingly.
- *
- * @param[in] size Size of vector.
- *
- * @return Rescaled value.
- */
-#define ASYMM_RESCALE_IMPL(size)                                                                  \
-  inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value,             \
-                                                      int src_integer_bits, int dst_integer_bits) \
-  {                                                                                               \
-    int exponent = src_integer_bits - dst_integer_bits;                                           \
-    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                         \
-  }
-
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
-  asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
-  ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
-  asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
-  asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
-#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
-                           remainder, size)                                                    \
-  exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
-                           remainder)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
-  asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
-  asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
-  asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
-#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
-  asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
-
-ASYMM_MULT_IMPL(2)
-ASYMM_MULT_IMPL(4)
-ASYMM_MULT_IMPL(8)
-ASYMM_MULT_IMPL(16)
-
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
-ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
-
-ASYMM_SELECT_USING_MASK_IMPL(2)
-ASYMM_SELECT_USING_MASK_IMPL(4)
-ASYMM_SELECT_USING_MASK_IMPL(8)
-ASYMM_SELECT_USING_MASK_IMPL(16)
-
-ASYMM_MASK_IF_ZERO_IMPL(2)
-ASYMM_MASK_IF_ZERO_IMPL(4)
-ASYMM_MASK_IF_ZERO_IMPL(8)
-ASYMM_MASK_IF_ZERO_IMPL(16)
-
-ASYMM_MASK_IF_NON_ZERO_IMPL(2)
-ASYMM_MASK_IF_NON_ZERO_IMPL(4)
-ASYMM_MASK_IF_NON_ZERO_IMPL(8)
-ASYMM_MASK_IF_NON_ZERO_IMPL(16)
-
-EXP_BARREL_SHIFTER_IMPL(2)
-EXP_BARREL_SHIFTER_IMPL(4)
-EXP_BARREL_SHIFTER_IMPL(8)
-EXP_BARREL_SHIFTER_IMPL(16)
-
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
-ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
-
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
-ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
-
-ASYMM_ROUNDING_HALF_SUM_IMPL(2)
-ASYMM_ROUNDING_HALF_SUM_IMPL(4)
-ASYMM_ROUNDING_HALF_SUM_IMPL(8)
-ASYMM_ROUNDING_HALF_SUM_IMPL(16)
-
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
-ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
-
-ASYMM_RESCALE_IMPL(2)
-ASYMM_RESCALE_IMPL(4)
-ASYMM_RESCALE_IMPL(8)
-ASYMM_RESCALE_IMPL(16)
-
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
-\ No newline at end of file
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
deleted file mode 100644
index e3aa463db..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Performs a negation of input tensor.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: S16/S32/F16/F32.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
- */
-__kernel void neg_tensor(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VSTORE(VEC_SIZE)
-    (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
deleted file mode 100644
index ecf4696e9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- *
- * @param[in]  pad_values                           Padding values for each of the dimensions. Only pad values for Up(for
- *                                                  batch), Top(for height), Left(for width) and Front(for depth) are
- *				                    required. Supported data type: S32
- */
-
-__kernel void pad(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output),
-     const int4 pad_values)
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int index[4]={0};
-
-    index[0] = get_global_id(0);//W
-    index[1] = get_global_id(1);//H
-    index[2] = get_global_id(2) % DEPTH_OUT;//C
-    index[3] = get_global_id(2) / DEPTH_OUT;//N
-
-    if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
-        index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
-        index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
-        index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
-    {
-        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
-    }
-    else
-    {
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
-                                              tensor4D_offset(&in, index[0] - pad_values.x,
-                                                              index[1] - pad_values.y,
-                                                              index[2] - pad_values.z,
-                                                              index[3] - pad_values.w));
-    }
- }
-
-#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
deleted file mode 100644
index 7cc8b0354..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
-/** Perform a Generic permute operation on an input tensor of Shape DCHW.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1
-6/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in b
-ytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in b
-ytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in b
-ytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_generic(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-   int out_index[4];
-   int in_index[4];
-   in_index[0] = get_global_id(0);//W
-   in_index[1] = get_global_id(1);//H
-   in_index[2] = get_global_id(2) % DEPTH_IN;//C
-   in_index[3] = get_global_id(2) / DEPTH_IN;//B
-   out_index[0] = in_index[P1];
-   out_index[1] = in_index[P2];
-   out_index[2] = in_index[P3];
-   out_index[3] = in_index[P4];
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
deleted file mode 100644
index aa05121b1..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* SATURATE */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
-
-/** Performs a pixelwise division with float scale of either integer or float inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
- *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16, F16, F32
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  scale                             Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_div_float(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out),
-    const float scale)
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-
-    // Perform division
-#ifdef DATA_TYPE_FLOAT
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-#else  /* DATA_TYPE_FLOAT */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
-#endif /* DATA_TYPE_FLOAT */
-
-    // Store result
-    vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
deleted file mode 100644
index fdfb78003..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(SATURATE)
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
-#else // SATURATE
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
-#endif // SATURATE
-#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
-
-#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size)
-
-/** Performs a pixelwise division with integer scale of integer inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/S16
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  scale                             Integer scaling factor. Supported data types: S32
- */
-__kernel void pixelwise_div_int(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out),
-    const uint scale)
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-
-    // Perform division and store result
-    vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
deleted file mode 100644
index ab1307e64..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* SATURATE */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
-
-#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to inputs
- *  -# Multiply inputs
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
- *
- * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  scale                             Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_mul_qasymm8(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out),
-    const float scale)
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load data
-    VEC_DATA_TYPE(int, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
-    VEC_DATA_TYPE(int, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
-    // Perform multiplication of two inputs
-    VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
-    VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-    VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
-
-    // Multiply with a multiplier smaller than 1
-    out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
-    out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
-    VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
-// TODO: Apply min-max BOUND to support fuse with relu.
-/*
-#if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
-#endif // defined(MAX_BOUND)
-*/
-
-    // Store result
-    VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)),
-                     0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 68da2ba32..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : F16/F32
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : F16/F32
- * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void prelu(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(alpha),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D alpha  = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VSTORE(VEC_SIZE)
-      (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ?
-                       VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) :
-	               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
-       0, (__global DATA_TYPE *)output.ptr);
-
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index 7e97b7ed6..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-#define SUB(x, y) (x) - (y)
-
-#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported Data types : QASYMM8
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[in]  alpha_ptr                            Pointer to the source image. Supported Data types : QASYMM8
- * @param[in]  alpha_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  alpha_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  alpha_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  alpha_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  alpha_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  alpha_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  alpha_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void prelu_qasymm8(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(alpha),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
-    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
-    in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1));
-    in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2));
-
-    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
-    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-    const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32;
-    const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
-    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global uchar *)output.ptr);
-}
-
-#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
deleted file mode 100644
index 8bef49363..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform reduce max/min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-__kernel void reduce_min_max(TENSOR4D_DECLARATION(input),
-                             TENSOR4D_DECLARATION(output),
-                             const int axis,
-                             const int dim)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
-
-    DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    for(int i = 1; i < dim; ++i)
-    {
-      indices[axis] = i;
-
-      #if OP_CODE == 1 // REDUCE_MAX
-       value = max(value, *((__global DATA_TYPE *)
-			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-
-      #elif OP_CODE == 2 // REDUCE_MIN
-       value = min(value, *((__global DATA_TYPE *)
-			   tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
-
-      #else // OP NOT SUPPORTED
-       return;
-
-      #endif
-    }
-
-    *((__global DATA_TYPE *)out.ptr) = value;
-}
-
-/** Perform reduce sum/mean
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using
- * -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  axis                                 Axis through which reduction occurs
- * @param[in]  dim                                  Dimension across the axis to be reduced.
- */
-__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input),
-                              TENSOR4D_DECLARATION(output),
-                              const int axis,
-                              const int dim)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int indices[4] =
-    {
-        get_global_id(0),
-        get_global_id(1),
-        get_global_id(2) % DEPTH_OUT,
-        get_global_id(2) / DEPTH_OUT,
-    };
-
-    DATA_TYPE sum_value = (DATA_TYPE)0;
-    for(int i = 0; i < dim; ++i)
-    {
-      indices[axis] = i;
-      sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
-    }
-
-    #if OP_CODE == 3 // REDUCE_SUM
-     *((__global DATA_TYPE *)out.ptr) = sum_value;
-
-    #elif OP_CODE == 4 // REDUCE_MEAN
-     *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
-
-    #else // OP NOT SUPPORTED
-     return;
-
-    #endif
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
deleted file mode 100644
index a0fc2d5a9..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
-/** Perform space to batch with input of 4D and NCHW format
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- *
- * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
- * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
- * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- */
-__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
-                                     TENSOR4D_DECLARATION(output),
-                                     VECTOR_DECLARATION(block_size),
-                                     IMAGE_DECLARATION(padding_size))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int block_size_x = *((__global int *)(block_size_ptr));
-    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
-    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
-    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
-
-    int in_index[4] = {0, };
-    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
-    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
-    in_index[2] = get_global_id(2) % DEPTH_OUT;
-    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
-
-    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
-    {
-        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
-    }
-    else
-    {
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
-    }
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
-
-#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
-/** Perform space to batch with input of 4D and NHWC format
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
- * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
- * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
- * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16
- * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_stride_w                              Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
- * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
- * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- */
-__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
-                                     TENSOR4D_DECLARATION(output),
-                                     VECTOR_DECLARATION(block_size),
-                                     IMAGE_DECLARATION(padding_size))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
-
-    int block_size_x = *((__global int *)(block_size_ptr));
-    int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
-    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
-    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
-
-    int in_index[4] = {0, };
-    in_index[0] = get_global_id(0) * VEC_SIZE;
-    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
-    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y));
-    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
-
-    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
-    {
-        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
-    }
-    else
-    {
-        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
-                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                         0, (__global DATA_TYPE *)out.ptr);
-    }
-}
-
-#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index f6977045a..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
-/** Perform space to depth rearrangement of tensor
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
-t_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
-bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_depth(
-     TENSOR4D_DECLARATION(input),
-     TENSOR4D_DECLARATION(output))
- {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    int out_index[4]={0};
-    int in_index[4]={0};
-
-    in_index[0] = get_global_id(0);//W
-    in_index[1] = get_global_id(1);//H
-    in_index[2] = get_global_id(2) % DEPTH_IN;//C
-    in_index[3] = get_global_id(2) / DEPTH_IN;//B
-
-    out_index[0] = in_index[0]/BLOCK_SIZE;
-    out_index[1] = in_index[1]/BLOCK_SIZE;
-    out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
-    out_index[3] = in_index[3];
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
- }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
deleted file mode 100644
index 3e1a5c97f..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns true value of squared_difference of two tensors.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in]  input1_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input1_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input1_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[in]  input2_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input2_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input2_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source image
- *
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: F16/F32
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void squared_difference(
-    TENSOR3D_DECLARATION(input1),
-    TENSOR3D_DECLARATION(input2),
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor3D input1  = CONVERT_TO_TENSOR3D_STRUCT(input1);
-    Tensor3D input2  = CONVERT_TO_TENSOR3D_STRUCT(input2);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr);
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    sq_diff = diff * diff;
-
-    VSTORE(VEC_SIZE)
-    (sq_diff, 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(DATA_TYPE)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
deleted file mode 100644
index b39c55b96..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice_ex.cl
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "helpers.h"
-
-#if defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
-/** Extracts a strided slice up to 4-dimensions
- *
- * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  starts                               The stride of X dimension of input tensor to be sliced. Supported data types: S32
- * @param[in]  strides                              The stride of Y dimension of input tensor to be sliced. Supported data types: S32
- */
-__kernel void strided_slice_ex(TENSOR4D_DECLARATION(input),
-                               TENSOR4D_DECLARATION(output),
-                               const int4 starts,
-                               const int4 strides)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
-    int4 indices_in =
-    {
-            starts.x + (strides.x * get_global_id(0)),
-            starts.y + (strides.y * get_global_id(1)),
-            starts.z + (strides.z * (get_global_id(2) % DEPTH_OUT)),
-            starts.w + (strides.w * (get_global_id(2) / DEPTH_OUT)),
-    };
-    *((__global ELEMENT_DATA_TYPE *)out.ptr) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&in, indices_in.x, indices_in.y, indices_in.z, indices_in.w));
-}
-#endif // defined(ELEMENT_DATA_TYPE) && defined(DEPTH_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
deleted file mode 100644
index d97f23a47..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-__kernel void topkv2_init(VECTOR_DECLARATION(input),
-    __global float* in_key_buf,
-    __global int* in_ind_buf,
-    const int n)
-{
-  int gid = get_global_id(0);
-  int lws = get_local_size(0);
-  int groups = get_num_groups(0);
-  int gws = lws * groups;
-  int iter = n / gws;
-
-  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-
-  for(int i = 0; i < iter; ++i)
-  {
-    int idx = i * gws + gid;
-    in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x);
-    in_ind_buf[idx] = idx;
-  }
-}
-
-__kernel void topkv2_find_first_negative(
-    __global float *out_key_buf,
-    __global int *first_negative_idx,
-    int n)
-{
-  int gid = get_global_id(0);
-
-  if( gid == n - 1 )
-  {
-    // if the last item is positive, the first negative index is n.
-    if( out_key_buf[gid] > 0.f )
-      *first_negative_idx = n;
-  } else if ( gid == 0 ) {
-    // if the first item is negative, set it 0.
-    if( out_key_buf[gid] < 0.f )
-      *first_negative_idx = 0;
-  } else {
-    // if its left is positive and it is negative, then it is the first negative item.
-    if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f )
-      *first_negative_idx = gid;
-  }
-}
-
-__kernel void topkv2_reorder_negatives(
-    __global float* in_key_buf,
-    __global float* out_key_buf,
-    __global float* in_ind_buf,
-    __global float* out_ind_buf,
-    __global int* first_negative_idx,
-    int n)
-{
-  int gid = get_global_id(0);
-
-  int num_negs = n - *first_negative_idx;
-  int in_idx;
-
-  if( gid < num_negs ) {
-    in_idx = n - 1 - gid;
-  } else {
-    in_idx = gid - num_negs;
-  }
-
-  out_key_buf[gid] = in_key_buf[in_idx];
-  out_ind_buf[gid] = in_ind_buf[in_idx];
-}
-
-__kernel void topkv2_store(
-    VECTOR_DECLARATION(values),
-    VECTOR_DECLARATION(indices),
-    __global float *out_key_buf,
-    __global int *out_ind_buf,
-    int n)
-{
-  int gid = get_global_id(0);
-
-  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
-  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
-
-  int idx = n - 1 - gid;
-
-  *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
-  *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
deleted file mode 100644
index 0292fab04..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.h"
-
-__global inline float* get_vec_elem(Vector* vec, int idx)
-{
-  return (__global float*)(vec->ptr + idx * vec->stride_x);
-}
-
-__global inline int* get_vec_elem_int(Vector* vec, int idx)
-{
-  return (__global int*)(vec->ptr + idx * vec->stride_x);
-}
-
-// A utility function to swap two elements
-void swap(__global float *a, __global float *b)
-{
-    float t = *a;
-    *a = *b;
-    *b = t;
-}
-
-void swap_idx(__global int *a, __global int *b)
-{
-    int t = *a;
-    *a = *b;
-    *b = t;
-}
-
-/* This function is same in both iterative and recursive*/
-int partition (Vector* arr, __global int* indices, int l, int h)
-{
-    float x = *get_vec_elem(arr, h);
-    int i = (l - 1);
-
-    for (int j = l; j <= h- 1; j++)
-    {
-        if (*get_vec_elem(arr, j) >= x)
-        {
-            i++;
-            swap (get_vec_elem(arr,i), get_vec_elem(arr,j));
-            swap_idx(&indices[i], &indices[j]);
-        }
-    }
-    swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
-    swap_idx(&indices[i + 1], &indices[h]);
-    return (i + 1);
-}
-
-/* A[] --> Array to be sorted,
-   l  --> Starting index,
-   h  --> Ending index */
-void quickSortIterative (Vector* arr, __global int* indices, 
-    __global int *stack, int l, int h)
-{
-    // Create an auxiliary stack
-
-    // initialize top of stack
-    int top = -1;
-
-    // push initial values of l and h to stack
-    stack[ ++top ] = l;
-    stack[ ++top ] = h;
-
-    // Keep popping from stack while is not empty
-    while ( top >= 0 )
-    {
-        // Pop h and l
-        h = stack[ top-- ];
-        l = stack[ top-- ];
-
-        // Set pivot element at its correct position
-        // in sorted array
-        int p = partition( arr, indices, l, h );
-
-        // If there are elements on left side of pivot,
-        // then push left side to stack
-        if ( p-1 > l )
-        {
-            stack[ ++top ] = l;
-            stack[ ++top ] = p - 1;
-        }
-
-        // If there are elements on right side of pivot,
-        // then push right side to stack
-        if ( p+1 < h )
-        {
-            stack[ ++top ] = p + 1;
-            stack[ ++top ] = h;
-        }
-    }
-}
-
-__kernel void topkv2_quicksort(VECTOR_DECLARATION(input),
-    VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices),
-    __global int* indices, __global int* temp_stack, int k, int n)
-{
-  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
-  Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
-  Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);
-
-  for( int i = 0; i < n; ++i )
-  {
-    indices[i] = i;
-  }
-
-  quickSortIterative(&input, indices, temp_stack, 0, n-1);
-
-  // extract k items.
-  for(int i = 0; i < k; ++i)
-  {
-    *get_vec_elem(&topk_values, i)  = *get_vec_elem(&input, i);
-    *get_vec_elem_int(&topk_indices, i) = indices[i];
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index c2c2d89a4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float* in_key_buf,
-      __global int* d_Histograms,
-      const int pass,
-      __local int* loc_histo,
-      const int n)
-{
-  int it = get_local_id(0);  // i local number of the processor
-  int ig = get_global_id(0); // global number = i + g I
-
-  int gr = get_group_id(0); // g group number
-
-  int groups = get_num_groups(0);
-  int items  = get_local_size(0);
-
-  // set the local histograms to zero
-  for(int ir=0;ir<_RADIX;ir++){
-    loc_histo[ir * items + it] = 0;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // range of keys that are analyzed by the work item
-  int size= n/groups/items; // size of the sub-list
-  int start= ig * size; // beginning of the sub-list
-
-  unsigned int key;
-  int shortkey,k;
-
-  // compute the index
-  // the computation depends on the transposition
-  for(int j = 0; j < size ; j++) {
-#ifdef TRANSPOSE
-    k= groups * items * j + ig;
-#else
-    k=j+start;
-#endif
-
-    key = *((__global unsigned int*)(in_key_buf + k));
-
-    // extract the group of _BITS bits of the pass
-    // the result is in the range 0.._RADIX-1
-    shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
-
-    // increment the local histogram
-    loc_histo[shortkey *  items + it ]++;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // copy the local histogram to the global one
-  for(int ir=0;ir<_RADIX;ir++) {
-    d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
-  }
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int* invect,
-      __global int* outvect,
-      const int nbcol,
-      const int nbrow,
-      const __global int* inperm,
-      __global int* outperm,
-      __local int* blockmat,
-      __local int* blockperm,
-      const int tilesize){
-
-  int i0 = get_global_id(0)*tilesize;  // first row index
-  int j = get_global_id(1);  // column index
-
-  int jloc = get_local_id(1);  // local column index
-
-  // fill the cache
-  for(int iloc=0;iloc<tilesize;iloc++){
-    int k=(i0+iloc)*nbcol+j;  // position in the matrix
-    blockmat[iloc*tilesize+jloc]=invect[k];
-#ifdef PERMUT
-    blockperm[iloc*tilesize+jloc]=inperm[k];
-#endif
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // first row index in the transpose
-  int j0=get_group_id(1)*tilesize;
-
-  // put the cache at the good place
-  for(int iloc=0;iloc<tilesize;iloc++){
-    int kt=(j0+iloc)*nbrow+i0+jloc;  // position in the transpose
-    outvect[kt]=blockmat[jloc*tilesize+iloc];
-#ifdef PERMUT
-      outperm[kt]=blockperm[jloc*tilesize+iloc];
-#endif
-  }
-
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float* in_key,
-    __global float* out_key,
-    __global int* d_Histograms,
-    const int pass,
-    __global int* indices_in,
-    __global int* indices_out,
-    __local int* loc_histo,
-    const int n){
-
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-
-  int gr = get_group_id(0);
-  int groups=get_num_groups(0);
-  int items=get_local_size(0);
-
-  int start= ig *(n/groups/items);
-  int size= n/groups/items;
-
-  // take the histogram in the cache
-  for(int ir=0;ir<_RADIX;ir++){
-    loc_histo[ir * items + it]=
-      d_Histograms[items * (ir * groups + gr) + it];
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  int newpos,shortkey,k,newpost;
-  unsigned int key;
-
-  for(int j= 0; j< size;j++){
-#ifdef TRANSPOSE
-      k= groups * items * j + ig;
-#else
-      k=j+start;
-#endif
-    float org_value = in_key[k];
-    key = *(__global unsigned int*)(in_key + k);
-    shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
-
-    newpos=loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
-    int ignew,jnew;
-    ignew= newpos/(n/groups/items);
-    jnew = newpos%(n/groups/items);
-    newpost = jnew * (groups*items) + ignew;
-#else
-    newpost=newpos;
-#endif
-
-    //d_outKeys[newpost]= key;  // killing line !!!
-    out_key[newpost] = org_value;
-
-#ifdef PERMUT
-    indices_out[newpost] = indices_in[k];
-#endif
-
-    newpos++;
-    loc_histo[shortkey * items + it]=newpos;
-  }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
-{
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-  int decale = 1;
-  int n=get_local_size(0) * 2 ;
-  int gr=get_group_id(0);
-
-  // load input into local memory
-  // up sweep phase
-  temp[2*it] = histo[2*ig];
-  temp[2*it+1] = histo[2*ig+1];
-
-  // parallel prefix sum (algorithm of Blelloch 1990)
-  for (int d = n>>1; d > 0; d >>= 1){
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (it < d){
-      int ai = decale*(2*it+1)-1;
-      int bi = decale*(2*it+2)-1;
-      temp[bi] += temp[ai];
-    }
-    decale *= 2;
-  }
-
-  // store the last element in the global sum vector
-  // (maybe used in the next step for constructing the global scan)
-  // clear the last element
-  if (it == 0) {
-    globsum[gr]=temp[n-1];
-    temp[n - 1] = 0;
-  }
-
-  // down sweep phase
-  for (int d = 1; d < n; d *= 2){
-    decale >>= 1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (it < d){
-      int ai = decale*(2*it+1)-1;
-      int bi = decale*(2*it+2)-1;
-
-      int t = temp[ai];
-      temp[ai] = temp[bi];
-      temp[bi] += t;
-    }
-
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // write results to device memory
-
-  histo[2*ig] = temp[2*it];
-  histo[2*ig+1] = temp[2*it+1];
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
-{
-  int ig = get_global_id(0);
-  int gr=get_group_id(0);
-
-  int s;
-
-  s=globsum[gr];
-
-  // write results to device memory
-  histo[2*ig] += s;
-  histo[2*ig+1] += s;
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
deleted file mode 100644
index 1fdd2f98f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/UtilsEx.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ActivationLayerInfoEx &act_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::F16, DataType::F32);
-
-  // Checks performed when output is configured
-  if ((output != nullptr) && (output->total_size() != 0))
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  if (output != nullptr)
-  {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, *input);
-  }
-
-  const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-  bool window_changed = false;
-
-  if (output != nullptr)
-  {
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-  }
-  else
-  {
-    window_changed = update_window_and_padding(
-        win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
-  }
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLActivationLayerExKernel::CLActivationLayerExKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
-{
-}
-
-void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
-                                          ActivationLayerInfoEx act_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _run_in_place = (output == nullptr) || (output == input);
-
-  if (output != nullptr)
-  {
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), *input->info()->clone());
-  }
-
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
-
-  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-  const DataType dt = input->info()->data_type();
-  float a_const = act_info.a();
-  float b_const = act_info.b();
-  int a_const_int = 0;
-  int b_const_int = 0;
-
-  // Create quantized version of constants a, b if needed
-  if (is_data_type_quantized(dt))
-  {
-    a_const_int =
-        input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
-    b_const_int =
-        input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
-  }
-
-  // Set build options
-  std::set<std::string> build_opts;
-  build_opts.emplace(
-      ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized(dt))
-  {
-    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
-    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
-
-    const int o1 = input->info()->quantization_info().offset;
-    // Quantized value of 0 corresponds to the offset o1
-    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
-
-    // Set scale and offset of the input and output if they have different quantization info
-    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
-    {
-      const float s1 = input->info()->quantization_info().scale;
-      const float s2 = output->info()->quantization_info().scale;
-      const int o2 = output->info()->quantization_info().offset;
-
-      if (o1 != o2 || s1 != s2)
-      {
-        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
-        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
-        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
-        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
-      }
-    }
-  }
-  else
-  {
-    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
-    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
-  }
-
-  build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
-
-  // Create kernel
-  std::string kernel_name = std::string("activation_layer_ex");
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Make sure _kernel is initialized before calling the parent's configure
-  _input = input;
-  _output = output;
-
-  // Configure kernel window
-  auto win_config =
-      validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = "activation_layer_ex_";
-  _config_id += lower_string(string_from_data_type(dt));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ActivationLayerInfoEx &act_info)
-{
-  const bool run_in_place = (output == nullptr) || (output == input);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input->clone().get(),
-                                    (run_in_place) ? nullptr : output->clone().get())
-          .first);
-
-  return Status{};
-}
-
-void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    if (!_run_in_place)
-    {
-      add_3D_tensor_argument(idx, _output, slice);
-    }
-    enqueue(queue, *this, slice, lws_hint());
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
deleted file mode 100644
index c1a2ad0be..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
-{
-  TensorShape out_shape{input_shape};
-
-  out_shape.set(argminmax_axis, 1);
-
-  return out_shape;
-}
-} // namespace
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
-                                                       DataType::U8);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match argminmax_axis");
-
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      argminmax_axis >= 0 && argminmax_axis < num_dimensions,
-      "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
-  return Status{};
-}
-
-} // namespace
-
-CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
-
-void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis));
-
-  _input = input;
-  _output = output;
-  _argminmax_axis = argminmax_axis;
-
-  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
-  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
-
-  // Construct kernel name for argmax and argmin based on axis
-  std::string kernel_name = "arg_op";
-  int op_code = 0;
-  if (op == ArgOperation::MAX)
-  {
-    op_code = 1;
-  }
-  else if (op == ArgOperation::MIN)
-  {
-    op_code = 2;
-  }
-  else
-    throw std::runtime_error("Operation not supported, yet");
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
-  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output_info, Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
-
-  return Status{};
-}
-
-void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &shape_in = _input->info()->tensor_shape();
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
-  _kernel.setArg<cl_int>(idx++, _argminmax_axis);
-  _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Copy output's shape in order to use for recovering at end of this method
-  const TensorShape shape_out = _output->info()->tensor_shape();
-  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
-  // Recover output's shape of output tensor
-  _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
deleted file mode 100644
index 1c505b4d5..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_UNUSED(policy);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        output->data_type() == DataType::U8 &&
-            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-        "Output can only be U8 if both inputs are U8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
-                                                        ITensorInfo *output)
-{
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output, out_shape);
-
-    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output, Format::S16);
-    }
-    else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output, Format::F16);
-    }
-    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output, Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                                ICLTensor *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input1->info(), input2->info(), output->info(), policy));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  const bool has_float_out = is_data_type_float(output->info()->data_type());
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
-
-  ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
-                                                 const ITensorInfo *input2,
-                                                 const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
-                                                            input2->clone().get(),
-                                                            output->clone().get())
-                                  .first);
-
-  return Status{};
-}
-
-void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticSubtractionExKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
deleted file mode 100644
index b0016d23c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t *block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2),
-                                  "Input Depth should be equal to Output Depth");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3),
-      "Input batch should be equal to (output batch * block size[0] *block size[1])");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) &&
-                                      !(output->dimension(1) % block_size[0]),
-                                  "Output height and width should be divisible by block size[0] "
-                                  "and block_size[1] respectively");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) &&
-                                      (output->dimension(1) == input->dimension(1) * block_size[0]),
-                                  "Output height and width should be equal to "
-                                  "input_height*blocksize[0] and input_width*blocksize[1] "
-                                  "respectively");
-
-  return Status{};
-}
-
-} // namespace
-
-CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                       const int32_t *block_size)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
-  build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
-  build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_out);
-    add_4D_tensor_argument(idx, _output, slice_in);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
deleted file mode 100644
index 3d2f2c702..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
-                           const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
-                                                         DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                        ICLTensor *output, BinaryLogicalOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "binary_logical_op";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-
-  int op_code = 0;
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      op_code = 1;
-      break;
-    case BinaryLogicalOperation::OR:
-      op_code = 2;
-      break;
-    default:
-      throw std::runtime_error("Operation not supported, yet");
-  }
-
-  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLBinaryLogicalOpKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index bf7ebae3f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  // Create kernel
-  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
-  {
-    const float scale_in = input->info()->quantization_info().scale;
-    const int offset_in = input->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
-    _kernel = static_cast<cl::Kernel>(
-        CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
-  }
-  else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
-  {
-    const float scale_in = output->info()->quantization_info().scale;
-    const int offset_in = output->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
-    _kernel = static_cast<cl::Kernel>(
-        CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
-  }
-  else
-  {
-    _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
-  }
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
deleted file mode 100644
index 5af5b16ea..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32, DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                     ICLTensor *output, const ComparisonOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "comparison_op";
-  int op_code = 0;
-
-  switch (op)
-  {
-    case ComparisonOperation::EQUAL:
-      op_code = 1;
-      break;
-    case ComparisonOperation::NOT_EQUAL:
-      op_code = 2;
-      break;
-    default:
-      throw std::runtime_error(" Operation not supported, yet");
-  }
-
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
-  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
-  build_opts.emplace(
-      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
-      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
-       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
-  {
-    build_opts.emplace("-DOFFSET_IN1=" +
-                       support::cpp11::to_string(input1->info()->quantization_info().offset));
-    build_opts.emplace("-DOFFSET_IN2=" +
-                       support::cpp11::to_string(input2->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN1=" +
-                       support::cpp11::to_string(input1->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_IN2=" +
-                       support::cpp11::to_string(input2->info()->quantization_info().scale));
-    kernel_name += "_qasymm8";
-  }
-
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input1->info()->data_type() == DataType::S16 ||
-        input2->info()->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output->info(), Format::S16);
-    }
-    else if (input1->info()->data_type() == DataType::F16 &&
-             input2->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input1->info()->data_type() == DataType::F32 ||
-             input2->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComparisonOpKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index c386e3312..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size,
-                                  "Output width should be equal to (Input width * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size,
-                                  "Output height should be equal to (Input height * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0,
-                                  "Input depth should be divisible by (block size * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      output->dimension(2) == input->dimension(2) / (block_size * block_size),
-      "Output depth should be equal to (Input depth / (block size * block size))");
-
-  return Status{};
-}
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
-  // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const int32_t block_size)
-{
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
deleted file mode 100644
index 0862b78bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *lookups)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const ICLTensor *lookups)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _input = input;
-  _output = output;
-  _lookups = lookups;
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "embedding_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_in);
-    add_1D_tensor_argument(idx, _lookups, win_lookup);
-
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
deleted file mode 100644
index b1ee21bdc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLExpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Auto initialize output
-  auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
-                     input->info()->quantization_info());
-
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
deleted file mode 100644
index ae2801e2b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
-
-void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Construct kernel name
-  std::string kernel_name = "gather";
-  if (input1->info()->num_dimensions() == 1)
-  {
-    kernel_name = "gather_1d";
-  }
-  else if (input1->info()->num_dimensions() == 2)
-  {
-    if (_output->info()->num_dimensions() == 1)
-    {
-      kernel_name = "gather_1d_out";
-    }
-  }
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
-  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
-
-  return Status{};
-}
-
-void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  if (_input1->info()->num_dimensions() == 1)
-  {
-    Window slice = window.first_slice_window_1D();
-
-    unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input1, slice);
-    add_1D_tensor_argument(idx, _input2, slice);
-    add_1D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  }
-  else if (_input1->info()->num_dimensions() == 2)
-  {
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
-
-    // Set inputs
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _input1, window_collapsed);
-    add_1D_tensor_argument(idx, _input2, slice);
-    if (_output->info()->num_dimensions() == 1)
-    {
-      add_1D_tensor_argument(idx, _output, slice);
-    }
-    else
-    {
-      add_2D_tensor_argument(idx, _output, window_collapsed);
-    }
-    enqueue(queue, *this, slice);
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
deleted file mode 100644
index cd7b21c6d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLHashtableLookupKernel::CLHashtableLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                         const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Output's shape was not set");
-
-  ARM_COMPUTE_ERROR_ON(lookups->dimensions(0) == hits->dimensions(0) &&
-                       output->dimension(output->num_dimensions() - 1) == lookups->dimension(0));
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
-                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _lookups = lookups;
-  _keys = keys;
-  _input = input;
-  _output = output;
-  _hits = hits;
-
-  // Make _lookup_indices tensor
-  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
-  _lookup_indices->allocator()->init(
-      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
-  _lookup_indices->allocator()->allocate();
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "hashtable_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const_cast<ICLTensor *>(_lookups)->map(queue);
-  const_cast<ICLTensor *>(_keys)->map(queue);
-  _hits->map(queue);
-  _lookup_indices->map(queue);
-
-  // Set values of hits
-  const int32_t *lookups_buf =
-      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
-  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
-  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
-  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
-
-  std::map<int32_t, size_t> key_map;
-  const size_t keys_num = _keys->info()->dimension(0);
-  for (size_t key_index = 0; key_index < keys_num; key_index++)
-  {
-    key_map[keys_buf[key_index]] = key_index;
-  }
-
-  const size_t lookups_num = _lookups->info()->dimension(0);
-  for (size_t i = 0; i < lookups_num; ++i)
-  {
-    const auto lookup_value = lookups_buf[i];
-    const auto it = key_map.find(lookup_value);
-    if (it != key_map.end())
-    {
-#if defined(DEBUG)
-      if (it->second >= lookups_num)
-        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
-#endif // defined(DEBUG)
-      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
-      hits_buf[i] = static_cast<uint8_t>(1);
-    }
-    else
-    {
-      lookup_indices_buf[i] = -1;
-      hits_buf[i] = static_cast<uint8_t>(0);
-    }
-  }
-
-  const_cast<ICLTensor *>(_lookups)->unmap(queue);
-  const_cast<ICLTensor *>(_keys)->unmap(queue);
-  _hits->unmap(queue);
-  _lookup_indices->unmap(queue);
-
-  Window win = window.collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, win);
-    add_4D_tensor_argument(idx, _output, win);
-    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
-
-    enqueue(queue, *this, win);
-  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
deleted file mode 100644
index 80d99dd3b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNegKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(),
-                                              output->info()->tensor_shape());
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
-
-  // Configure window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice, lws_hint());
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
deleted file mode 100644
index 12bbe910f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
-  // Checks performed when output is configured
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                        NormalizationLayerInfo norm_info)
-{
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output, *input->clone());
-
-  const unsigned int norm_size = norm_info.norm_size();
-  bool is_in_map = norm_info.is_in_map();
-
-  const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
-  const BorderSize border_size = BorderSize(0, border_width);
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  const unsigned int num_elems_read_per_iteration =
-      is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2))
-                : num_elems_processed_per_iteration;
-
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-  // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
-  // the kernel, avoiding padding
-  AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-  output_access.set_valid_region(win, input->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
-    : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
-{
-}
-
-BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; }
-
-void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                             NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output->info(), *input->info()->clone());
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
-
-  _input = input;
-  _output = output;
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
-
-  // Set build options
-  CLBuildOptions build_opts;
-  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.add_option(
-      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
-  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
-  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-  build_opts.add_option(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
-  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
-  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
-
-  // Create kernel
-  std::string kernel_name =
-      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = "normalization_layer_";
-  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(
-      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(norm_info.norm_size());
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                              NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
-
-  return Status{};
-}
-
-void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
-  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
-  Window slice = window_collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 241f8ae4d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info()));
-
-  _input = input;
-  _alpha = alpha;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "prelu";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
-  {
-    build_opts.emplace("-DOFF_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().scale));
-    kernel_name += "_qasymm8";
-  }
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input->info()->data_type() == DataType::F32 ||
-             alpha->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
-  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input->info()->tensor_shape();
-  const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice_input1);
-    add_3D_tensor_argument(idx, _alpha, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPReLUKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
deleted file mode 100644
index 99b54c822..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
-                          const ITensorInfo *pad_size_info)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 &&
-                                      input_info->num_dimensions() <= 4,
-                                  "Pad kernel supports upto 4-D input tensor");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input_info->num_dimensions() == output_info->num_dimensions(),
-      "output tensor should have same number of dimensions as input tensor");
-
-  if (input_info->data_type() == DataType::QASYMM8)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
-                                        output_info->quantization_info(),
-                                    "The input and output quantization info are different!");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
-
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
-
-  _input = input;
-  _output = output;
-  _pad_size = pad_size;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
-  build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
-  build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
-  build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
-  if (input->info()->data_type() == DataType::QASYMM8)
-  {
-    build_opts.emplace("-DZERO_VALUE=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-  }
-  else
-  {
-    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
-  }
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  _pad_size->map(queue);
-
-  // Padding values only for up, top, left and front are required based on the rank of tensor
-  int rank = _pad_size->info()->dimension(1);
-
-  auto pad_batch_up =
-      (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
-  auto pad_height_top =
-      (rank >= 2)
-          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
-          : 0;
-  auto pad_width_left = (rank >= 1)
-                            ? *reinterpret_cast<const int32_t *>(
-                                  _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
-                            : 0;
-  auto pad_depth_front =
-      (rank >= 3)
-          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
-          : 0;
-
-  _pad_size->unmap(queue);
-
-  // Pad_values which needs to be passed
-  const cl_int4 paddingValues = {
-      {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
-       static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    _kernel.setArg<cl_int4>(idx++, paddingValues);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
deleted file mode 100644
index aa094761c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
-{
-  TensorShape output_shape = input->tensor_shape();
-  permute(output_shape, perm);
-  return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  const TensorShape output_shape =
-      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
-
-  // Validate configured output
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-  return Status{};
-}
-} // namespace
-
-CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {}
-
-void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const PermutationVector &perm)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
-
-  _input = input;
-  _output = output;
-  _perm = perm;
-
-  const TensorShape output_shape = get_output_shape(input->info(), perm);
-  // Output auto inizialitation if not yet initialized
-  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-  // Create kernel
-  std::set<std::string> build_opts;
-
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-  // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector
-  build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0]));
-  build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1]));
-  build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2]));
-  build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3]));
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*input->info(), Steps());
-
-  // The CLPermute doesn't need padding so update_window_and_padding() can be skipped
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
-
-  return Status{};
-}
-
-void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
deleted file mode 100644
index b985aa737..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
-                          RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_UNUSED(overflow_policy);
-  ARM_COMPUTE_UNUSED(rounding_policy);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
-
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        output->data_type() == DataType::U8 &&
-            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-        "Output can only be U8 if both inputs are U8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
-                                                        ITensorInfo *output)
-{
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output, out_shape);
-
-    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output, Format::S16);
-    }
-    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output, Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                          ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy,
-                                          RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
-                                                scale, overflow_policy, rounding_policy));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  int scale_int = -1;
-  // Extract sign, exponent and mantissa
-  int exponent = 0;
-  float normalized_mantissa = std::frexp(scale, &exponent);
-  // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
-  // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <=
-  // 14
-  // Moreover, it will be negative as we deal with 1/2^n
-  if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
-  {
-    // Store the positive exponent. We know that we compute 1/2^n
-    // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
-    scale_int = std::abs(exponent - 1);
-  }
-
-  std::string data_type;
-  std::string compute_type;
-  // Check if it has float inputs and output
-  if (is_data_type_float(input1->info()->data_type()) ||
-      is_data_type_float(input2->info()->data_type()))
-  {
-    scale_int = -1;
-    compute_type = (input1->info()->data_type() == DataType::F32 ||
-                    input2->info()->data_type() == DataType::F32)
-                       ? "float"
-                       : "half";
-    data_type = "DATA_TYPE_FLOAT";
-  }
-  else
-  {
-    if (input1->info()->data_type() == DataType::S16 ||
-        input2->info()->data_type() == DataType::S16)
-    {
-      compute_type = "int";
-    }
-    else
-    {
-      compute_type = "ushort";
-    }
-    data_type = "DATA_TYPE_INT";
-  }
-
-  // Construct kernel name
-  std::string kernel_name = "pixelwise_div";
-  kernel_name += (scale_int >= 0) ? "_int" : "_float";
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace(
-      (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()))
-          ? "-DWRAP"
-          : "-DSATURATE");
-  build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
-                                                                  : "-DROUND=_rte");
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
-  build_opts.emplace("-D" + data_type);
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Set scale argument
-  unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
-  if (scale_int >= 0)
-  {
-    _kernel.setArg(idx++, scale_int);
-  }
-  else
-  {
-    _kernel.setArg(idx++, scale);
-  }
-
-  ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                           const ITensorInfo *output, float scale,
-                                           ConvertPolicy overflow_policy,
-                                           RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
-                                                            input2->clone().get(),
-                                                            output->clone().get())
-                                  .first);
-
-  return Status{};
-}
-
-void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPixelWiseDivisionKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
deleted file mode 100644
index f581780e1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-namespace
-{
-// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
-// are the same.
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
-  TensorShape out_shape{input_shape};
-
-  out_shape.set(axis, 1);
-
-  return out_shape;
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
-                          ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32, DataType::S32);
-  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
-                                    "Not support QASYMM8, yet");
-  }
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      axis >= 0 && axis < num_dimensions,
-      "axis must be greater than or equal to 0 and less than (input's rank).");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match axis");
-
-  return Status{};
-}
-} // namespace
-
-CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-  _input = input;
-  _output = output;
-  _axis = axis;
-
-  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
-  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
-  // Construct kernel name
-  std::string kernel_name;
-  int op_code = 0;
-  if (op == ReduceOperation::MAX)
-  {
-    kernel_name = "reduce_min_max";
-    op_code = 1;
-  }
-  else if (op == ReduceOperation::MIN)
-  {
-    kernel_name = "reduce_min_max";
-    op_code = 2;
-  }
-  else if (op == ReduceOperation::SUM)
-  {
-    kernel_name = "reduce_sum_mean";
-    op_code = 3;
-  }
-  else if (op == ReduceOperation::MEAN)
-  {
-    kernel_name = "reduce_sum_mean";
-    op_code = 4;
-  }
-  else
-    throw std::runtime_error("Operation not supported, yet");
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
-  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output_info, Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
-  return Status{};
-}
-
-void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &shape_in = _input->info()->tensor_shape();
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
-  _kernel.setArg<cl_int>(idx++, _axis);
-  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
-  // Support dimensions up to 4
-  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Copy output's shape in order to use for recovering at end of this method
-  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
-  // of input and output are the same
-  const TensorShape shape_out = _output->info()->tensor_shape();
-  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
-  idx = 0;
-  add_4D_tensor_argument(idx, _input, slice_in);
-  add_4D_tensor_argument(idx, _output, slice_out);
-  enqueue(queue, *this, slice_out);
-
-  // Recover output's shape of output tensor
-  _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
deleted file mode 100644
index 6b0697e89..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
-                          const ITensorInfo *padding_size, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
-                                  "The number of dimensions of input should be equal to output");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
-                                  "The input and output layouts are different!");
-
-  // TODO Support other cases
-  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
-                                    "Input Depth should be equal to Output Depth");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
-                                        padding_size->dimension(1) != 2,
-                                    "Only 2-dimensional spatial block's size was wrong");
-  }
-  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
-                                    "Input Depth should be equal to Output Depth");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
-                                        padding_size->dimension(1) != 2,
-                                    "Only 2-dimensional spatial block's size was wrong");
-  }
-  else
-  {
-    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
-                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
-
-  if (input->data_type() == DataType::QASYMM8)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
-                                    "The input and output quantization info are different!");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
-                                       const ICLTensor *padding_size, ICLTensor *output)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
-
-  _input = input;
-  _block_size = block_size;
-  _padding_size = padding_size;
-  _output = output;
-
-  // Set kernel build options
-  // TODO Support other cases
-  std::string kernel_name = "space_to_batch_4d";
-  std::set<std::string> build_opts;
-  Window win;
-
-  if (input->info()->data_layout() == DataLayout::NCHW)
-  {
-    kernel_name += "_nchw";
-    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
-
-    win = calculate_max_window(*output->info(), Steps());
-
-    Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-  }
-  else if (input->info()->data_layout() == DataLayout::NHWC)
-  {
-    kernel_name += "_nhwc";
-    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DVEC_SIZE=" +
-                       support::cpp11::to_string(num_elems_processed_per_iteration));
-
-    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    input_access.set_valid_region(win, output->info()->valid_region());
-
-    if (window_changed)
-    {
-      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-    }
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR("Unsupported layout");
-  }
-
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
-  if (input->info()->data_type() == DataType::QASYMM8)
-  {
-    build_opts.emplace("-DZERO_VALUE=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-  }
-  else
-  {
-    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
-  }
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-#if defined(DEBUG)
-  const_cast<ICLTensor *>(_block_size)->map(queue);
-  const_cast<ICLTensor *>(_padding_size)->map(queue);
-
-  const size_t num_dimensions = _input->info()->num_dimensions();
-  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
-  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
-  for (size_t i = 0; i < num_spacial_dimensions; ++i)
-  {
-    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
-    const int32_t padding_size_pre =
-        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
-    const int32_t padding_size_post =
-        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
-
-    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
-    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0,
-                             "Padding size should be greater than or equal to 0");
-
-    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
-    {
-      ARM_COMPUTE_ERROR_ON_MSG(
-          _output->info()->dimension(i) !=
-              (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
-          "Dimension value of spatial block does not match output's dimension value");
-    }
-    else
-    {
-      ARM_COMPUTE_ERROR_ON_MSG(
-          _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
-              (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
-               padding_size_pre + padding_size_post) /
-                  block_size,
-          "Dimension value of spatial block does not match output's dimension value");
-    }
-
-    batch_size *= block_size;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(
-      _output->info()->dimension(num_dimensions - 1) != batch_size,
-      "Output batch size should be equal to input batch size * (multiplication of all block size)");
-
-  const_cast<ICLTensor *>(_block_size)->unmap(queue);
-  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
-#endif // defined(DEBUG)
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Set block size window
-  Window win_block = calculate_max_window(*_block_size->info(), Steps());
-
-  // Set padding size window
-  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    add_1D_tensor_argument(idx, _block_size, win_block);
-    add_2D_tensor_argument(idx, _padding_size, win_padding);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 5d6329edc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3),
-                                  "Input batch should be equal to Output batch");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input->dimension(2) * block_size * block_size == output->dimension(2),
-      "Output depth should be equal to (input depth * block size *block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) &&
-                                      !(input->dimension(1) % block_size),
-                                  "Input height and width should be divisible by block size");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) &&
-                                      (output->dimension(1) == (input->dimension(1) / block_size)),
-                                  "Output height and width should be equal to "
-                                  "input_height/blocksize and input_width/blocksize respectively");
-
-  return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const int32_t block_size)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*input->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
deleted file mode 100644
index 260bc39f1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                          ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input1->info()->data_type() == DataType::F16 &&
-        input2->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input1->info()->data_type() == DataType::F32 ||
-             input2->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLSquaredDifferenceKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
deleted file mode 100644
index 48146a43a..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-
-using namespace arm_compute;
-
-CLStridedSliceExKernel::CLStridedSliceExKernel()
-    : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
-      _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
-{
-}
-
-Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                        const ITensorInfo *begin, const ITensorInfo *end,
-                                        const ITensorInfo *strides, int32_t beginMask,
-                                        int32_t endMask, int32_t shrinkAxisMask)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(),
-                                              strides->tensor_shape());
-
-  return Status{};
-}
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride,
-                            const TensorShape &inputShape, int32_t axis)
-{
-  // Begin with the specified index
-  int32_t start = begin;
-
-  // beginMask override
-  if (beginMask & 1 << axis)
-  {
-    if (stride > 0)
-    {
-      // Forward iteration - use the first element. These values will get
-      // clamped below (Note: We could have set them to 0 and axisSize-1, but
-      // use lowest() and max() to maintain symmetry with StopForAxis())
-      start = std::numeric_limits<int32_t>::lowest();
-    }
-    else
-    {
-      // Backward iteration - use the last element.
-      start = std::numeric_limits<int32_t>::max();
-    }
-  }
-
-  // Handle negative indices
-  int32_t axisSize = inputShape[axis];
-  if (start < 0)
-  {
-    start += axisSize;
-  }
-
-  // Clamping
-  start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
-  return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
-                           const TensorShape &inputShape, int32_t axis)
-{
-  // Begin with the specified index
-  int32_t stop = end;
-
-  // endMask override
-  if (endMask & (1 << axis))
-  {
-    if (stride > 0)
-    {
-      // Forward iteration - use the last element. These values will get
-      // clamped below
-      stop = std::numeric_limits<int32_t>::max();
-    }
-    else
-    {
-      // Backward iteration - use the first element.
-      stop = std::numeric_limits<int32_t>::lowest();
-    }
-  }
-
-  // Handle negative indices
-  int32_t axisSize = inputShape[axis];
-  if (stop < 0)
-  {
-    stop += axisSize;
-  }
-
-  // Clamping
-  // Because the end index points one past the last element, we need slightly
-  // different clamping ranges depending on the direction.
-  if (stride > 0)
-  {
-    // Forward iteration
-    stop = arm_compute::utility::clamp(stop, 0, axisSize);
-  }
-  else
-  {
-    // Backward iteration
-    stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
-  }
-
-  return stop;
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
-  int32_t ret = 0;
-  if (stride > 0)
-  {
-    ret = ((stop - start - 1) / stride) + 1;
-  }
-  else
-  {
-    ret = ((stop - start + 1) / stride) + 1;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number");
-  return ret;
-}
-
-void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                       ICLTensor *beginData, ICLTensor *endData,
-                                       ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
-                                       int32_t shrinkAxisMask)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
-                                      endData->info(), stridesData->info(), beginMask, endMask,
-                                      shrinkAxisMask));
-
-  _input = input;
-  _output = output;
-  _beginData = beginData;
-  _endData = endData;
-  _stridesData = stridesData;
-  _beginMask = beginMask;
-  _endMask = endMask;
-  _shrinkAxisMask = shrinkAxisMask;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DELEMENT_DATA_TYPE=" +
-                     get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-  ICLKernel::configure_internal(win);
-}
-
-void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  _beginData->map(queue);
-  _endData->map(queue);
-  _stridesData->map(queue);
-
-  std::vector<int32_t> starts;
-  std::vector<int32_t> strides;
-
-  for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
-  {
-    const TensorShape shape = _input->info()->tensor_shape();
-    starts.emplace_back(
-        StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
-                     reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
-
-    strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
-  }
-
-  for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
-  {
-    starts.emplace_back(0);
-    strides.emplace_back(1);
-  }
-  // TODO: Apply shrinkAxisMask
-
-  _beginData->unmap(queue);
-  _stridesData->unmap(queue);
-  _endData->unmap(queue);
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-  const cl_int4 startsArg = {{
-      static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
-      static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, startsArg);
-
-  const cl_int4 stridesArg = {{
-      static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]),
-      static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, stridesArg);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
deleted file mode 100644
index 073c2f7bb..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-namespace arm_compute
-{
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
-
-void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
-                               cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-  _topk_values = topk_values;
-  _topk_indices = topk_indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
-
-  unsigned int idx = 3 * num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *indices);
-  _kernel.setArg(idx++, *temp_stack);
-  _kernel.setArg<cl_int>(idx++, k);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, 1, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-  add_1D_tensor_argument(idx, _topk_values, window);
-  add_1D_tensor_argument(idx, _topk_indices, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
-
-void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
-                             int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
-
-  unsigned int idx = num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *in_key_buf);
-  _kernel.setArg(idx++, *in_ind_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// This kernel makes a histogram of radix for each work item.
-CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
-
-void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
-
-  int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 3;
-  _kernel.setArg(idx++, loc_histo_size, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg<cl_int>(2, _pass);
-
-  cl::NDRange lws = cl::NDRange(_ITEMS, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
-
-void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
-
-void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
-                                               int bits)
-{
-  ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *glob_sum_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *temp_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
-
-void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortReorder::CLRadixSortReorder()
-    : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
-      _out_ind_buf(nullptr)
-{
-}
-
-void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
-
-  unsigned int idx = 2;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 6;
-  _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
-  cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg(1, *_out_key_buf);
-  _kernel.setArg<cl_int>(3, _pass);
-  _kernel.setArg(4, *_in_ind_buf);
-  _kernel.setArg(5, *_out_ind_buf);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
-
-void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_out_key_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
-    : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
-
-  unsigned int idx = 4;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_in_key_buf);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_in_ind_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Store::CLTopKV2Store()
-    : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(k == 0);
-  ARM_COMPUTE_ERROR_ON(k > n);
-
-  _values = values;
-  _indices = indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
-
-  unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, k, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
-{
-  _out_key_buf = out_key_buf;
-  _out_ind_buf = out_ind_buf;
-}
-
-void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _values, window);
-  add_1D_tensor_argument(idx, _indices, window);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
deleted file mode 100644
index 3b5782c25..000000000
--- a/libs/ARMComputeEx/src/core/NEON/kernels/NENormalizationLayerExKernel.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerExKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEMath.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared,
-                          const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
-
-  // Checks performed when output is configured
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
-                                                        ITensorInfo *input_squared,
-                                                        ITensorInfo *output,
-                                                        const NormalizationLayerInfo &norm_info)
-{
-  unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-  const unsigned int num_elems_read_per_iteration =
-      num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-  const unsigned int num_rows =
-      (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
-  const unsigned int border_width =
-      (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
-  BorderSize border_size = BorderSize(0, border_width);
-  bool window_changed = false;
-
-  // Configure window
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-  AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration,
-                                     num_rows);
-  AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0,
-                                             num_elems_read_per_iteration, num_rows);
-
-  if (output->total_size() != 0)
-  {
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    window_changed =
-        update_window_and_padding(win, input_access, input_squared_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-  }
-  else
-  {
-    window_changed = update_window_and_padding(win, input_access, input_squared_access);
-  }
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-NENormalizationLayerExKernel::NENormalizationLayerExKernel()
-    : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr),
-      _norm_info(NormType::IN_MAP_1D), _border_size()
-{
-}
-
-BorderSize NENormalizationLayerExKernel::border_size() const { return _border_size; }
-
-void NENormalizationLayerExKernel::configure(const ITensor *input, const ITensor *input_squared,
-                                             ITensor *output, NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output->info(), *input->info());
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
-
-  const unsigned int border_width =
-      (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
-
-  _input = input;
-  _input_squared = input_squared;
-  _output = output;
-  _norm_info = norm_info;
-  _border_size = BorderSize(0, border_width);
-
-  switch (_input->info()->data_type())
-  {
-    case DataType::F32:
-    {
-      switch (norm_info.type())
-      {
-        case NormType::IN_MAP_1D:
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, false>;
-          break;
-        case NormType::IN_MAP_2D:
-          // Normalize over X and Y
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 0, true>;
-          break;
-        case NormType::CROSS_MAP:
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F32, 2, false>;
-          break;
-        default:
-          break;
-      }
-      break;
-    }
-    case DataType::F16:
-    {
-      switch (norm_info.type())
-      {
-        case NormType::IN_MAP_1D:
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, false>;
-          break;
-        case NormType::IN_MAP_2D:
-          // Normalize over X and Y
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 0, true>;
-          break;
-        case NormType::CROSS_MAP:
-          _func = &NENormalizationLayerExKernel::normalize_float<DataType::F16, 2, false>;
-          break;
-        default:
-          break;
-      }
-      break;
-    }
-    default:
-      ARM_COMPUTE_ERROR("NOT SUPPORTED!");
-  }
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), input_squared->info(),
-                                                  output->info(), norm_info);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  INEKernel::configure(win_config.second);
-}
-
-template <DataType dt, unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerExKernel::normalize_float(const Window &window)
-{
-  Iterator input(_input, window);
-  Iterator input_squared(_input_squared, window);
-  Iterator output(_output, window);
-
-  const int dim_y = 1;
-  const int radius = _norm_info.norm_size();
-  const int total_size = _input->info()->dimension(dim) - 1;
-  const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
-  // We account padding across X only and we iterate over rows
-  const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
-  const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
-  const int min_top = 0;
-  const int max_bottom = _input->info()->dimension(dim_y) - 1;
-
-  if (dt == DataType::F32)
-  {
-    const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
-    const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
-    const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
-
-    execute_window_loop(
-        window,
-        [&](const Coordinates &id) {
-          // Get range to normalize
-          const int current_row = do_2D_norm ? id[dim_y] : 0;
-          const int current_slice = id[dim];
-          const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-          const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-          const int first_slice = std::max(current_slice - radius, min_left);
-          const int last_slice = std::min(current_slice + radius, max_right);
-
-          // Accumulate 2D In-Map values
-          float32x4_t accu = vdupq_n_f32(0.f);
-          for (int j = first_row; j <= last_row; j++)
-          {
-            // Compute row displacement
-            const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr =
-                input_squared.ptr() + row - (current_slice * input_squared_stride);
-            for (int i = first_slice; i <= last_slice; ++i)
-            {
-              accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(
-                                         input_squared_ptr + i * input_squared_stride)));
-            }
-          }
-
-          // Normalize
-          const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
-          const float32x4_t normalized_pixel = vmulq_f32(
-              vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
-          vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-  }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-  else if (dt == DataType::F16)
-  {
-    const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
-    const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
-    const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
-
-    execute_window_loop(
-        window,
-        [&](const Coordinates &id) {
-          // Get range to normalize
-          const int current_row = do_2D_norm ? id[dim_y] : 0;
-          const int current_slice = id[dim];
-          const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-          const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-          const int first_slice = std::max(current_slice - radius, min_left);
-          const int last_slice = std::min(current_slice + radius, max_right);
-
-          // Accumulate 2D In-Map values
-          float16x8_t accu = vdupq_n_f16(0.f);
-          for (int j = first_row; j <= last_row; j++)
-          {
-            // Compute row displacement
-            const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr =
-                input_squared.ptr() + row - (current_slice * input_squared_stride);
-            for (int i = first_slice; i <= last_slice; ++i)
-            {
-              accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(
-                                         input_squared_ptr + i * input_squared_stride)));
-            }
-          }
-
-          const float16x8_t norm_f16 =
-              vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
-          const float16x8_t normalized_pixel = vmulq_f16(
-              vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
-          vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-  }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-  else
-  {
-    ARM_COMPUTE_ERROR("Not supported");
-  }
-}
-
-Status NENormalizationLayerExKernel::validate(const ITensorInfo *input,
-                                              const ITensorInfo *input_squared,
-                                              const ITensorInfo *output,
-                                              const NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                            input_squared->clone().get(),
-                                                            output->clone().get(), norm_info)
-                                  .first);
-
-  return Status{};
-}
-
-void NENormalizationLayerExKernel::run(const Window &window, const ThreadInfo &info)
-{
-  ARM_COMPUTE_UNUSED(info);
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-  ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-  // Run function
-  (this->*_func)(window);
-}
diff --git a/libs/ARMComputeEx/src/core/UtilsEx.cpp b/libs/ARMComputeEx/src/core/UtilsEx.cpp
deleted file mode 100644
index b63093bbb..000000000
--- a/libs/ARMComputeEx/src/core/UtilsEx.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/UtilsEx.h"
-
-#include <cstdint>
-#include <fstream>
-#include <map>
-#include <string>
-
-using namespace arm_compute;
-
-const std::string &
-arm_compute::string_from_activation_func_ex(ActivationLayerInfoEx::ActivationFunction act)
-{
-  static std::map<ActivationLayerInfoEx::ActivationFunction, const std::string> act_map = {
-      {ActivationLayerInfoEx::ActivationFunction::RSQRT, "RSQRT"},
-  };
-
-  return act_map[act];
-}