24 files changed, 0 insertions, 4214 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
deleted file mode 100644
index 1fdd2f98f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/UtilsEx.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ActivationLayerInfoEx &act_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::F16, DataType::F32);
-
-  // Checks performed when output is configured
-  if ((output != nullptr) && (output->total_size() != 0))
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  if (output != nullptr)
-  {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, *input);
-  }
-
-  const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-  bool window_changed = false;
-
-  if (output != nullptr)
-  {
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-  }
-  else
-  {
-    window_changed = update_window_and_padding(
-        win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
-  }
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLActivationLayerExKernel::CLActivationLayerExKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
-{
-}
-
-void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
-                                          ActivationLayerInfoEx act_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  _run_in_place = (output == nullptr) || (output == input);
-
-  if (output != nullptr)
-  {
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), *input->info()->clone());
-  }
-
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
-
-  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-  const DataType dt = input->info()->data_type();
-  float a_const = act_info.a();
-  float b_const = act_info.b();
-  int a_const_int = 0;
-  int b_const_int = 0;
-
-  // Create quantized version of constants a, b if needed
-  if (is_data_type_quantized(dt))
-  {
-    a_const_int =
-        input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
-    b_const_int =
-        input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
-  }
-
-  // Set build options
-  std::set<std::string> build_opts;
-  build_opts.emplace(
-      ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized(dt))
-  {
-    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
-    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
-
-    const int o1 = input->info()->quantization_info().offset;
-    // Quantized value of 0 corresponds to the offset o1
-    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
-
-    // Set scale and offset of the input and output if they have different quantization info
-    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
-    {
-      const float s1 = input->info()->quantization_info().scale;
-      const float s2 = output->info()->quantization_info().scale;
-      const int o2 = output->info()->quantization_info().offset;
-
-      if (o1 != o2 || s1 != s2)
-      {
-        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
-        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
-        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
-        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
-      }
-    }
-  }
-  else
-  {
-    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
-    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
-  }
-
-  build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
-
-  // Create kernel
-  std::string kernel_name = std::string("activation_layer_ex");
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Make sure _kernel is initialized before calling the parent's configure
-  _input = input;
-  _output = output;
-
-  // Configure kernel window
-  auto win_config =
-      validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = "activation_layer_ex_";
-  _config_id += lower_string(string_from_data_type(dt));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ActivationLayerInfoEx &act_info)
-{
-  const bool run_in_place = (output == nullptr) || (output == input);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input->clone().get(),
-                                    (run_in_place) ? nullptr : output->clone().get())
-          .first);
-
-  return Status{};
-}
-
-void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    if (!_run_in_place)
-    {
-      add_3D_tensor_argument(idx, _output, slice);
-    }
-    enqueue(queue, *this, slice, lws_hint());
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
deleted file mode 100644
index c1a2ad0be..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
-{
-  TensorShape out_shape{input_shape};
-
-  out_shape.set(argminmax_axis, 1);
-
-  return out_shape;
-}
-} // namespace
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
-                                                       DataType::U8);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match argminmax_axis");
-
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      argminmax_axis >= 0 && argminmax_axis < num_dimensions,
-      "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
-  return Status{};
-}
-
-} // namespace
-
-CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
-
-void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis));
-
-  _input = input;
-  _output = output;
-  _argminmax_axis = argminmax_axis;
-
-  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
-  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
-
-  // Construct kernel name for argmax and argmin based on axis
-  std::string kernel_name = "arg_op";
-  int op_code = 0;
-  if (op == ArgOperation::MAX)
-  {
-    op_code = 1;
-  }
-  else if (op == ArgOperation::MIN)
-  {
-    op_code = 2;
-  }
-  else
-    throw std::runtime_error("Operation not supported, yet");
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
-  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output_info, Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const uint32_t argminmax_axis, ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
-
-  return Status{};
-}
-
-void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &shape_in = _input->info()->tensor_shape();
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
-  _kernel.setArg<cl_int>(idx++, _argminmax_axis);
-  _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Copy output's shape in order to use for recovering at end of this method
-  const TensorShape shape_out = _output->info()->tensor_shape();
-  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
-  // Recover output's shape of output tensor
-  _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
deleted file mode 100644
index 1c505b4d5..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_UNUSED(policy);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        output->data_type() == DataType::U8 &&
-            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-        "Output can only be U8 if both inputs are U8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
-                                                        ITensorInfo *output)
-{
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output, out_shape);
-
-    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output, Format::S16);
-    }
-    else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output, Format::F16);
-    }
-    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output, Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                                ICLTensor *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input1->info(), input2->info(), output->info(), policy));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  const bool has_float_out = is_data_type_float(output->info()->data_type());
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
-
-  ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
-                                                 const ITensorInfo *input2,
-                                                 const ITensorInfo *output, ConvertPolicy policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
-                                                            input2->clone().get(),
-                                                            output->clone().get())
-                                  .first);
-
-  return Status{};
-}
-
-void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticSubtractionExKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
deleted file mode 100644
index b0016d23c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t *block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] >= 1 && block_size[1] >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) == output->dimension(2),
-                                  "Input Depth should be equal to Output Depth");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      output->dimension(3) * block_size[0] * block_size[1] == input->dimension(3),
-      "Input batch should be equal to (output batch * block size[0] *block size[1])");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(output->dimension(0) % block_size[1]) &&
-                                      !(output->dimension(1) % block_size[0]),
-                                  "Output height and width should be divisible by block size[0] "
-                                  "and block_size[1] respectively");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == input->dimension(0) * block_size[1]) &&
-                                      (output->dimension(1) == input->dimension(1) * block_size[0]),
-                                  "Output height and width should be equal to "
-                                  "input_height*blocksize[0] and input_width*blocksize[1] "
-                                  "respectively");
-
-  return Status{};
-}
-
-} // namespace
-
-CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                       const int32_t *block_size)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
-  build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
-  build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_out);
-    add_4D_tensor_argument(idx, _output, slice_in);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
deleted file mode 100644
index 3d2f2c702..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
-                           const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
-                                                         DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                        ICLTensor *output, BinaryLogicalOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "binary_logical_op";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-
-  int op_code = 0;
-  switch (op)
-  {
-    case BinaryLogicalOperation::AND:
-      op_code = 1;
-      break;
-    case BinaryLogicalOperation::OR:
-      op_code = 2;
-      break;
-    default:
-      throw std::runtime_error("Operation not supported, yet");
-  }
-
-  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLBinaryLogicalOpKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index bf7ebae3f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  // Create kernel
-  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
-  {
-    const float scale_in = input->info()->quantization_info().scale;
-    const int offset_in = input->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
-    _kernel = static_cast<cl::Kernel>(
-        CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
-  }
-  else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
-  {
-    const float scale_in = output->info()->quantization_info().scale;
-    const int offset_in = output->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
-    _kernel = static_cast<cl::Kernel>(
-        CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
-  }
-  else
-  {
-    _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts));
-  }
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
deleted file mode 100644
index 5af5b16ea..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32, DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32, DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLComparisonOpKernel::CLComparisonOpKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                     ICLTensor *output, const ComparisonOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "comparison_op";
-  int op_code = 0;
-
-  switch (op)
-  {
-    case ComparisonOperation::EQUAL:
-      op_code = 1;
-      break;
-    case ComparisonOperation::NOT_EQUAL:
-      op_code = 2;
-      break;
-    default:
-      throw std::runtime_error(" Operation not supported, yet");
-  }
-
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
-  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
-  build_opts.emplace(
-      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
-      ((input1->info()->quantization_info().offset != input2->info()->quantization_info().offset) ||
-       (input1->info()->quantization_info().scale != input2->info()->quantization_info().scale)))
-  {
-    build_opts.emplace("-DOFFSET_IN1=" +
-                       support::cpp11::to_string(input1->info()->quantization_info().offset));
-    build_opts.emplace("-DOFFSET_IN2=" +
-                       support::cpp11::to_string(input2->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN1=" +
-                       support::cpp11::to_string(input1->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_IN2=" +
-                       support::cpp11::to_string(input2->info()->quantization_info().scale));
-    kernel_name += "_qasymm8";
-  }
-
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input1->info()->data_type() == DataType::S16 ||
-        input2->info()->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output->info(), Format::S16);
-    }
-    else if (input1->info()->data_type() == DataType::F16 &&
-             input2->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input1->info()->data_type() == DataType::F32 ||
-             input2->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLComparisonOpKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index c386e3312..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) == input->dimension(0) * block_size,
-                                  "Output width should be equal to (Input width * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) == input->dimension(1) * block_size,
-                                  "Output height should be equal to (Input height * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) == 0,
-                                  "Input depth should be divisible by (block size * block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      output->dimension(2) == input->dimension(2) / (block_size * block_size),
-      "Output depth should be equal to (Input depth / (block size * block size))");
-
-  return Status{};
-}
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
-  // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const int32_t block_size)
-{
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
deleted file mode 100644
index 0862b78bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *lookups)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const ICLTensor *lookups)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _input = input;
-  _output = output;
-  _lookups = lookups;
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "embedding_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_in);
-    add_1D_tensor_argument(idx, _lookups, win_lookup);
-
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
deleted file mode 100644
index b1ee21bdc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLExpKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLExpKernel::CLExpKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Auto initialize output
-  auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(),
-                     input->info()->quantization_info());
-
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", build_opts));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
deleted file mode 100644
index ae2801e2b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {}
-
-void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Construct kernel name
-  std::string kernel_name = "gather";
-  if (input1->info()->num_dimensions() == 1)
-  {
-    kernel_name = "gather_1d";
-  }
-  else if (input1->info()->num_dimensions() == 2)
-  {
-    if (_output->info()->num_dimensions() == 1)
-    {
-      kernel_name = "gather_1d_out";
-    }
-  }
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
-  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
-
-  return Status{};
-}
-
-void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  if (_input1->info()->num_dimensions() == 1)
-  {
-    Window slice = window.first_slice_window_1D();
-
-    unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input1, slice);
-    add_1D_tensor_argument(idx, _input2, slice);
-    add_1D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  }
-  else if (_input1->info()->num_dimensions() == 2)
-  {
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
-
-    // Set inputs
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _input1, window_collapsed);
-    add_1D_tensor_argument(idx, _input2, slice);
-    if (_output->info()->num_dimensions() == 1)
-    {
-      add_1D_tensor_argument(idx, _output, slice);
-    }
-    else
-    {
-      add_2D_tensor_argument(idx, _output, window_collapsed);
-    }
-    enqueue(queue, *this, slice);
-  }
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
deleted file mode 100644
index cd7b21c6d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  input_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLHashtableLookupKernel::CLHashtableLookupKernel()
-    : _input(nullptr), _output(nullptr), _lookups(nullptr)
-{
-}
-
-Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
-                                         const ITensorInfo *input, const ITensorInfo *output,
-                                         const ITensorInfo *hits)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Output's shape was not set");
-
-  ARM_COMPUTE_ERROR_ON(lookups->dimensions(0) == hits->dimensions(0) &&
-                       output->dimension(output->num_dimensions() - 1) == lookups->dimension(0));
-  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
-  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
-
-  return Status{};
-}
-
-void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
-                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
-
-  _lookups = lookups;
-  _keys = keys;
-  _input = input;
-  _output = output;
-  _hits = hits;
-
-  // Make _lookup_indices tensor
-  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
-  _lookup_indices->allocator()->init(
-      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
-  _lookup_indices->allocator()->allocate();
-
-  // Set kernel build options
-  std::stringstream kernel_name;
-  std::set<std::string> build_opts;
-  kernel_name << "hashtable_lookup";
-
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-}
-
-void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const_cast<ICLTensor *>(_lookups)->map(queue);
-  const_cast<ICLTensor *>(_keys)->map(queue);
-  _hits->map(queue);
-  _lookup_indices->map(queue);
-
-  // Set values of hits
-  const int32_t *lookups_buf =
-      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
-  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
-  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
-  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
-
-  std::map<int32_t, size_t> key_map;
-  const size_t keys_num = _keys->info()->dimension(0);
-  for (size_t key_index = 0; key_index < keys_num; key_index++)
-  {
-    key_map[keys_buf[key_index]] = key_index;
-  }
-
-  const size_t lookups_num = _lookups->info()->dimension(0);
-  for (size_t i = 0; i < lookups_num; ++i)
-  {
-    const auto lookup_value = lookups_buf[i];
-    const auto it = key_map.find(lookup_value);
-    if (it != key_map.end())
-    {
-#if defined(DEBUG)
-      if (it->second >= lookups_num)
-        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
-#endif // defined(DEBUG)
-      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
-      hits_buf[i] = static_cast<uint8_t>(1);
-    }
-    else
-    {
-      lookup_indices_buf[i] = -1;
-      hits_buf[i] = static_cast<uint8_t>(0);
-    }
-  }
-
-  const_cast<ICLTensor *>(_lookups)->unmap(queue);
-  const_cast<ICLTensor *>(_keys)->unmap(queue);
-  _hits->unmap(queue);
-  _lookup_indices->unmap(queue);
-
-  Window win = window.collapse(ICLKernel::window(), 2, 4);
-
-  Window win_lookup;
-  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, win);
-    add_4D_tensor_argument(idx, _output, win);
-    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
-
-    enqueue(queue, *this, win);
-  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
deleted file mode 100644
index 80d99dd3b..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNegKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
-                                                DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(),
-                                              output->info()->tensor_shape());
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  return Status{};
-}
-
-} // namespace
-
-CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-  _input = input;
-  _output = output;
-
-  constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
-
-  // Configure window
-  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-  update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, input->info()->valid_region());
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-  Window slice = collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice, lws_hint());
-  } while (collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
deleted file mode 100644
index 12bbe910f..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
-  // Checks performed when output is configured
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                        NormalizationLayerInfo norm_info)
-{
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output, *input->clone());
-
-  const unsigned int norm_size = norm_info.norm_size();
-  bool is_in_map = norm_info.is_in_map();
-
-  const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
-  const BorderSize border_size = BorderSize(0, border_width);
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  const unsigned int num_elems_read_per_iteration =
-      is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2))
-                : num_elems_processed_per_iteration;
-
-  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-  // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
-  // the kernel, avoiding padding
-  AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-  output_access.set_valid_region(win, input->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
-    : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
-{
-}
-
-BorderSize CLNormalizationLayerExKernel::border_size() const { return _border_size; }
-
-void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                             NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  // Output tensor auto initialization if not yet initialized
-  auto_init_if_empty(*output->info(), *input->info()->clone());
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
-
-  _input = input;
-  _output = output;
-
-  const unsigned int num_elems_processed_per_iteration = 4;
-  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
-
-  // Set build options
-  CLBuildOptions build_opts;
-  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.add_option(
-      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
-  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
-  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-  build_opts.add_option(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
-  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
-  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
-
-  // Create kernel
-  std::string kernel_name =
-      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-  ICLKernel::configure_internal(win_config.second);
-
-  // Set config_id for enabling LWS tuning
-  _config_id = "normalization_layer_";
-  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(
-      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(norm_info.norm_size());
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(0));
-  _config_id += "_";
-  _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                              NormalizationLayerInfo norm_info)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
-
-  return Status{};
-}
-
-void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
-  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
-  Window slice = window_collapsed.first_slice_window_3D();
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-  } while (window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 241f8ae4d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
-                                                       DataType::QASYMM8);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), alpha->info(), output->info()));
-
-  _input = input;
-  _alpha = alpha;
-  _output = output;
-
-  // Create kernel
-  std::string kernel_name = "prelu";
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
-  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
-  {
-    build_opts.emplace("-DOFF_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN1=" +
-                       support::cpp11::to_string(input->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_IN2=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().scale));
-    kernel_name += "_qasymm8";
-  }
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input->info()->data_type() == DataType::F32 ||
-             alpha->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
-  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input->info()->tensor_shape();
-  const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, slice_input1);
-    add_3D_tensor_argument(idx, _alpha, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPReLUKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
deleted file mode 100644
index 99b54c822..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
-                          const ITensorInfo *pad_size_info)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() > 0 &&
-                                      input_info->num_dimensions() <= 4,
-                                  "Pad kernel supports upto 4-D input tensor");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input_info->num_dimensions() == output_info->num_dimensions(),
-      "output tensor should have same number of dimensions as input tensor");
-
-  if (input_info->data_type() == DataType::QASYMM8)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
-                                        output_info->quantization_info(),
-                                    "The input and output quantization info are different!");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
-
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
-
-  _input = input;
-  _output = output;
-  _pad_size = pad_size;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
-  build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
-  build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
-  build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
-  if (input->info()->data_type() == DataType::QASYMM8)
-  {
-    build_opts.emplace("-DZERO_VALUE=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-  }
-  else
-  {
-    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
-  }
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  _pad_size->map(queue);
-
-  // Padding values only for up, top, left and front are required based on the rank of tensor
-  int rank = _pad_size->info()->dimension(1);
-
-  auto pad_batch_up =
-      (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
-  auto pad_height_top =
-      (rank >= 2)
-          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
-          : 0;
-  auto pad_width_left = (rank >= 1)
-                            ? *reinterpret_cast<const int32_t *>(
-                                  _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
-                            : 0;
-  auto pad_depth_front =
-      (rank >= 3)
-          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
-          : 0;
-
-  _pad_size->unmap(queue);
-
-  // Pad_values which needs to be passed
-  const cl_int4 paddingValues = {
-      {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
-       static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    _kernel.setArg<cl_int4>(idx++, paddingValues);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
deleted file mode 100644
index aa094761c..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
-{
-  TensorShape output_shape = input->tensor_shape();
-  permute(output_shape, perm);
-  return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-
-  const TensorShape output_shape =
-      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
-
-  // Validate configured output
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-  return Status{};
-}
-} // namespace
-
-CLPermuteExKernel::CLPermuteExKernel() : _input(nullptr), _output(nullptr), _perm() {}
-
-void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                  const PermutationVector &perm)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
-
-  _input = input;
-  _output = output;
-  _perm = perm;
-
-  const TensorShape output_shape = get_output_shape(input->info(), perm);
-  // Output auto inizialitation if not yet initialized
-  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-  // Create kernel
-  std::set<std::string> build_opts;
-
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-  // New positions of batch(D), height(H), width(w) and channel(C) based on permutation vector
-  build_opts.emplace("-DP1=" + support::cpp11::to_string(perm[0]));
-  build_opts.emplace("-DP2=" + support::cpp11::to_string(perm[1]));
-  build_opts.emplace("-DP3=" + support::cpp11::to_string(perm[2]));
-  build_opts.emplace("-DP4=" + support::cpp11::to_string(perm[3]));
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*input->info(), Steps());
-
-  // The CLPermute doesn't need padding so update_window_and_padding() can be skipped
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                   const PermutationVector &perm)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
-
-  return Status{};
-}
-
-void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
deleted file mode 100644
index b985aa737..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
-                          RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_UNUSED(overflow_policy);
-  ARM_COMPUTE_UNUSED(rounding_policy);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
-
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        output->data_type() == DataType::U8 &&
-            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-        "Output can only be U8 if both inputs are U8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-
-  return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
-                                                        ITensorInfo *output)
-{
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output, out_shape);
-
-    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-    {
-      set_format_if_unknown(*output, Format::S16);
-    }
-    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output, Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-  return std::make_pair(err, win);
-}
-} // namespace
-
-CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                          ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy,
-                                          RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
-                                                scale, overflow_policy, rounding_policy));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  int scale_int = -1;
-  // Extract sign, exponent and mantissa
-  int exponent = 0;
-  float normalized_mantissa = std::frexp(scale, &exponent);
-  // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
-  // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <=
-  // 14
-  // Moreover, it will be negative as we deal with 1/2^n
-  if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
-  {
-    // Store the positive exponent. We know that we compute 1/2^n
-    // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
-    scale_int = std::abs(exponent - 1);
-  }
-
-  std::string data_type;
-  std::string compute_type;
-  // Check if it has float inputs and output
-  if (is_data_type_float(input1->info()->data_type()) ||
-      is_data_type_float(input2->info()->data_type()))
-  {
-    scale_int = -1;
-    compute_type = (input1->info()->data_type() == DataType::F32 ||
-                    input2->info()->data_type() == DataType::F32)
-                       ? "float"
-                       : "half";
-    data_type = "DATA_TYPE_FLOAT";
-  }
-  else
-  {
-    if (input1->info()->data_type() == DataType::S16 ||
-        input2->info()->data_type() == DataType::S16)
-    {
-      compute_type = "int";
-    }
-    else
-    {
-      compute_type = "ushort";
-    }
-    data_type = "DATA_TYPE_INT";
-  }
-
-  // Construct kernel name
-  std::string kernel_name = "pixelwise_div";
-  kernel_name += (scale_int >= 0) ? "_int" : "_float";
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace(
-      (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()))
-          ? "-DWRAP"
-          : "-DSATURATE");
-  build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
-                                                                  : "-DROUND=_rte");
-  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
-  build_opts.emplace("-D" + data_type);
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Set scale argument
-  unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters
-
-  if (scale_int >= 0)
-  {
-    _kernel.setArg(idx++, scale_int);
-  }
-  else
-  {
-    _kernel.setArg(idx++, scale);
-  }
-
-  ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                           const ITensorInfo *output, float scale,
-                                           ConvertPolicy overflow_policy,
-                                           RoundingPolicy rounding_policy)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
-                                                            input2->clone().get(),
-                                                            output->clone().get())
-                                  .first);
-
-  return Status{};
-}
-
-void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPixelWiseDivisionKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
deleted file mode 100644
index f581780e1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-namespace
-{
-// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
-// are the same.
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
-  TensorShape out_shape{input_shape};
-
-  out_shape.set(axis, 1);
-
-  return out_shape;
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
-                          ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32, DataType::S32);
-  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
-                                    "Not support QASYMM8, yet");
-  }
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  const auto num_dimensions = input->tensor_shape().num_dimensions();
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      axis >= 0 && axis < num_dimensions,
-      "axis must be greater than or equal to 0 and less than (input's rank).");
-
-  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
-                                  "output shape's size does not match axis");
-
-  return Status{};
-}
-} // namespace
-
-CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                        const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-  _input = input;
-  _output = output;
-  _axis = axis;
-
-  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
-  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
-  // Construct kernel name
-  std::string kernel_name;
-  int op_code = 0;
-  if (op == ReduceOperation::MAX)
-  {
-    kernel_name = "reduce_min_max";
-    op_code = 1;
-  }
-  else if (op == ReduceOperation::MIN)
-  {
-    kernel_name = "reduce_min_max";
-    op_code = 2;
-  }
-  else if (op == ReduceOperation::SUM)
-  {
-    kernel_name = "reduce_sum_mean";
-    op_code = 3;
-  }
-  else if (op == ReduceOperation::MEAN)
-  {
-    kernel_name = "reduce_sum_mean";
-    op_code = 4;
-  }
-  else
-    throw std::runtime_error("Operation not supported, yet");
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
-  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output_info, Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output_info->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                         const uint32_t axis, ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
-  return Status{};
-}
-
-void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &shape_in = _input->info()->tensor_shape();
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
-  _kernel.setArg<cl_int>(idx++, _axis);
-  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
-  // Support dimensions up to 4
-  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
-
-  // Setup input slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Copy output's shape in order to use for recovering at end of this method
-  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
-  // of input and output are the same
-  const TensorShape shape_out = _output->info()->tensor_shape();
-  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
-  idx = 0;
-  add_4D_tensor_argument(idx, _input, slice_in);
-  add_4D_tensor_argument(idx, _output, slice_out);
-  enqueue(queue, *this, slice_out);
-
-  // Recover output's shape of output tensor
-  _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
deleted file mode 100644
index 6b0697e89..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
-                          const ITensorInfo *padding_size, const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                       DataType::S16, DataType::F16, DataType::S32,
-                                                       DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
-                                  "The number of dimensions of input should be equal to output");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
-                                  "The input and output layouts are different!");
-
-  // TODO Support other cases
-  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
-                                    "Input Depth should be equal to Output Depth");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
-                                        padding_size->dimension(1) != 2,
-                                    "Only 2-dimensional spatial block's size was wrong");
-  }
-  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
-                                    "Input Depth should be equal to Output Depth");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
-                                        padding_size->dimension(1) != 2,
-                                    "Only 2-dimensional spatial block's size was wrong");
-  }
-  else
-  {
-    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
-  }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
-                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
-
-  if (input->data_type() == DataType::QASYMM8)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
-                                    "The input and output quantization info are different!");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
-                                       const ICLTensor *padding_size, ICLTensor *output)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
-
-  _input = input;
-  _block_size = block_size;
-  _padding_size = padding_size;
-  _output = output;
-
-  // Set kernel build options
-  // TODO Support other cases
-  std::string kernel_name = "space_to_batch_4d";
-  std::set<std::string> build_opts;
-  Window win;
-
-  if (input->info()->data_layout() == DataLayout::NCHW)
-  {
-    kernel_name += "_nchw";
-    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
-
-    win = calculate_max_window(*output->info(), Steps());
-
-    Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-  }
-  else if (input->info()->data_layout() == DataLayout::NHWC)
-  {
-    kernel_name += "_nhwc";
-    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DVEC_SIZE=" +
-                       support::cpp11::to_string(num_elems_processed_per_iteration));
-
-    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    input_access.set_valid_region(win, output->info()->valid_region());
-
-    if (window_changed)
-    {
-      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
-    }
-  }
-  else
-  {
-    ARM_COMPUTE_ERROR("Unsupported layout");
-  }
-
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
-  if (input->info()->data_type() == DataType::QASYMM8)
-  {
-    build_opts.emplace("-DZERO_VALUE=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-  }
-  else
-  {
-    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
-  }
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  // Configure kernel window
-  ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-#if defined(DEBUG)
-  const_cast<ICLTensor *>(_block_size)->map(queue);
-  const_cast<ICLTensor *>(_padding_size)->map(queue);
-
-  const size_t num_dimensions = _input->info()->num_dimensions();
-  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
-  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
-  for (size_t i = 0; i < num_spacial_dimensions; ++i)
-  {
-    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
-    const int32_t padding_size_pre =
-        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
-    const int32_t padding_size_post =
-        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
-
-    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
-    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0,
-                             "Padding size should be greater than or equal to 0");
-
-    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
-    {
-      ARM_COMPUTE_ERROR_ON_MSG(
-          _output->info()->dimension(i) !=
-              (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
-          "Dimension value of spatial block does not match output's dimension value");
-    }
-    else
-    {
-      ARM_COMPUTE_ERROR_ON_MSG(
-          _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
-              (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
-               padding_size_pre + padding_size_post) /
-                  block_size,
-          "Dimension value of spatial block does not match output's dimension value");
-    }
-
-    batch_size *= block_size;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(
-      _output->info()->dimension(num_dimensions - 1) != batch_size,
-      "Output batch size should be equal to input batch size * (multiplication of all block size)");
-
-  const_cast<ICLTensor *>(_block_size)->unmap(queue);
-  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
-#endif // defined(DEBUG)
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  // Set block size window
-  Window win_block = calculate_max_window(*_block_size->info(), Steps());
-
-  // Set padding size window
-  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    add_1D_tensor_argument(idx, _block_size, win_block);
-    add_2D_tensor_argument(idx, _padding_size, win_padding);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 5d6329edc..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const int32_t block_size)
-{
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
-                                                DataType::S16, DataType::S32, DataType::F16,
-                                                DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size >= 1,
-                                  "Block size should be greater than or equal to 1.");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) == output->dimension(3),
-                                  "Input batch should be equal to Output batch");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input->dimension(2) * block_size * block_size == output->dimension(2),
-      "Output depth should be equal to (input depth * block size *block size)");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(input->dimension(0) % block_size) &&
-                                      !(input->dimension(1) % block_size),
-                                  "Input height and width should be divisible by block size");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) == (input->dimension(0) / block_size)) &&
-                                      (output->dimension(1) == (input->dimension(1) / block_size)),
-                                  "Output height and width should be equal to "
-                                  "input_height/blocksize and input_width/blocksize respectively");
-
-  return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const int32_t block_size)
-{
-
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
-  _input = input;
-  _output = output;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*input->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_out(slice_in);
-  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_out.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_in);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
deleted file mode 100644
index 260bc39f1..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-  const TensorShape &out_shape =
-      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-        "Wrong shape for output");
-  }
-  return Status{};
-}
-} // namespace
-
-CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
-                                          ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
-
-  _input1 = input1;
-  _input2 = input2;
-  _output = output;
-
-  // Create kernel
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
-  build_opts.emplace(
-      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
-
-  const std::pair<TensorShape, ValidRegion> broadcast_pair =
-      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
-
-  const TensorShape &out_shape = broadcast_pair.first;
-  const ValidRegion &valid_region = broadcast_pair.second;
-
-  // Auto initialize output if not initialized
-  {
-    set_shape_if_empty(*output->info(), out_shape);
-
-    if (input1->info()->data_type() == DataType::F16 &&
-        input2->info()->data_type() == DataType::F16)
-    {
-      set_format_if_unknown(*output->info(), Format::F16);
-    }
-    else if (input1->info()->data_type() == DataType::F32 ||
-             input2->info()->data_type() == DataType::F32)
-    {
-      set_format_if_unknown(*output->info(), Format::F32);
-    }
-  }
-
-  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
-  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
-
-  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
-                        update_window_and_padding(win_input2, input2_access) ||
-                        update_window_and_padding(win, output_access);
-
-  output_access.set_valid_region(win, valid_region);
-
-  ICLKernel::configure_internal(win);
-}
-
-void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-  const TensorShape &out_shape = _output->info()->tensor_shape();
-
-  bool can_collapse = true;
-  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-  {
-    can_collapse =
-        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-    {
-      can_collapse = (in_shape1[d] == in_shape2[d]);
-    }
-  }
-
-  bool has_collapsed = false;
-  Window collapsed =
-      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
-                   : window;
-
-  const TensorShape &in_shape1_collapsed =
-      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-  const TensorShape &in_shape2_collapsed =
-      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-  Window slice = collapsed.first_slice_window_3D();
-  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-  do
-  {
-    unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input1, slice_input1);
-    add_3D_tensor_argument(idx, _input2, slice_input2);
-    add_3D_tensor_argument(idx, _output, slice);
-
-    enqueue(queue, *this, slice);
-
-    collapsed.slide_window_slice_3D(slice_input1);
-    collapsed.slide_window_slice_3D(slice_input2);
-  } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLSquaredDifferenceKernel::border_size() const
-{
-  const unsigned int replicateSize =
-      _output->info()->dimension(0) -
-      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-  const unsigned int border =
-      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-  return BorderSize(0, border, 0, 0);
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
deleted file mode 100644
index 48146a43a..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-
-using namespace arm_compute;
-
-CLStridedSliceExKernel::CLStridedSliceExKernel()
-    : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
-      _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
-{
-}
-
-Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                        const ITensorInfo *begin, const ITensorInfo *end,
-                                        const ITensorInfo *strides, int32_t beginMask,
-                                        int32_t endMask, int32_t shrinkAxisMask)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-  ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(),
-                                              strides->tensor_shape());
-
-  return Status{};
-}
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride,
-                            const TensorShape &inputShape, int32_t axis)
-{
-  // Begin with the specified index
-  int32_t start = begin;
-
-  // beginMask override
-  if (beginMask & 1 << axis)
-  {
-    if (stride > 0)
-    {
-      // Forward iteration - use the first element. These values will get
-      // clamped below (Note: We could have set them to 0 and axisSize-1, but
-      // use lowest() and max() to maintain symmetry with StopForAxis())
-      start = std::numeric_limits<int32_t>::lowest();
-    }
-    else
-    {
-      // Backward iteration - use the last element.
-      start = std::numeric_limits<int32_t>::max();
-    }
-  }
-
-  // Handle negative indices
-  int32_t axisSize = inputShape[axis];
-  if (start < 0)
-  {
-    start += axisSize;
-  }
-
-  // Clamping
-  start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
-  return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
-                           const TensorShape &inputShape, int32_t axis)
-{
-  // Begin with the specified index
-  int32_t stop = end;
-
-  // endMask override
-  if (endMask & (1 << axis))
-  {
-    if (stride > 0)
-    {
-      // Forward iteration - use the last element. These values will get
-      // clamped below
-      stop = std::numeric_limits<int32_t>::max();
-    }
-    else
-    {
-      // Backward iteration - use the first element.
-      stop = std::numeric_limits<int32_t>::lowest();
-    }
-  }
-
-  // Handle negative indices
-  int32_t axisSize = inputShape[axis];
-  if (stop < 0)
-  {
-    stop += axisSize;
-  }
-
-  // Clamping
-  // Because the end index points one past the last element, we need slightly
-  // different clamping ranges depending on the direction.
-  if (stride > 0)
-  {
-    // Forward iteration
-    stop = arm_compute::utility::clamp(stop, 0, axisSize);
-  }
-  else
-  {
-    // Backward iteration
-    stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
-  }
-
-  return stop;
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
-  int32_t ret = 0;
-  if (stride > 0)
-  {
-    ret = ((stop - start - 1) / stride) + 1;
-  }
-  else
-  {
-    ret = ((stop - start + 1) / stride) + 1;
-  }
-  ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number");
-  return ret;
-}
-
-void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                       ICLTensor *beginData, ICLTensor *endData,
-                                       ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
-                                       int32_t shrinkAxisMask)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
-                                      endData->info(), stridesData->info(), beginMask, endMask,
-                                      shrinkAxisMask));
-
-  _input = input;
-  _output = output;
-  _beginData = beginData;
-  _endData = endData;
-  _stridesData = stridesData;
-  _beginMask = beginMask;
-  _endMask = endMask;
-  _shrinkAxisMask = shrinkAxisMask;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DELEMENT_DATA_TYPE=" +
-                     get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
-
-  // Configure  kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-  ICLKernel::configure_internal(win);
-}
-
-void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  _beginData->map(queue);
-  _endData->map(queue);
-  _stridesData->map(queue);
-
-  std::vector<int32_t> starts;
-  std::vector<int32_t> strides;
-
-  for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
-  {
-    const TensorShape shape = _input->info()->tensor_shape();
-    starts.emplace_back(
-        StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
-                     reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
-
-    strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
-  }
-
-  for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
-  {
-    starts.emplace_back(0);
-    strides.emplace_back(1);
-  }
-  // TODO: Apply shrinkAxisMask
-
-  _beginData->unmap(queue);
-  _stridesData->unmap(queue);
-  _endData->unmap(queue);
-
-  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-  const cl_int4 startsArg = {{
-      static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
-      static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, startsArg);
-
-  const cl_int4 stridesArg = {{
-      static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]),
-      static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, stridesArg);
-
-  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
-  // Setup output slice
-  Window slice_in(slice_out);
-  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-  slice_in.set(3, Window::Dimension(0, 0, 0));
-
-  do
-  {
-    unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, slice_in);
-    add_4D_tensor_argument(idx, _output, slice_out);
-    enqueue(queue, *this, slice_out);
-  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
deleted file mode 100644
index 073c2f7bb..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-namespace arm_compute
-{
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {}
-
-void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices,
-                               cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-  _topk_values = topk_values;
-  _topk_indices = topk_indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts));
-
-  unsigned int idx = 3 * num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *indices);
-  _kernel.setArg(idx++, *temp_stack);
-  _kernel.setArg<cl_int>(idx++, k);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, 1, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-  add_1D_tensor_argument(idx, _topk_values, window);
-  add_1D_tensor_argument(idx, _topk_indices, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {}
-
-void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf,
-                             int n)
-{
-  ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  _input = input;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts));
-
-  unsigned int idx = num_arguments_per_1D_tensor();
-  _kernel.setArg(idx++, *in_key_buf);
-  _kernel.setArg(idx++, *in_ind_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _input, window);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// This kernel makes a histogram of radix for each work item.
-CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {}
-
-void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts));
-
-  int loc_histo_size = radix * _ITEMS * sizeof(cl_int);
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 3;
-  _kernel.setArg(idx++, loc_histo_size, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg<cl_int>(2, _pass);
-
-  cl::NDRange lws = cl::NDRange(_ITEMS, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortScanHistogram::CLRadixSortScanHistogram() {}
-
-void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {}
-
-void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf,
-                                               int bits)
-{
-  ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts));
-
-  int temp_size =
-      std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *glob_sum_buf);
-  _kernel.setArg(idx++, temp_size, nullptr);
-  _kernel.setArg(idx++, *temp_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {}
-
-void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts));
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *hist_buf);
-  _kernel.setArg(idx++, *glob_sum_buf);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLRadixSortReorder::CLRadixSortReorder()
-    : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr),
-      _out_ind_buf(nullptr)
-{
-}
-
-void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
-{
-  ARM_COMPUTE_ERROR_ON(hist_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  unsigned int radix = 1 << bits;
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits));
-  build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix));
-  build_opts.emplace("-DPERMUT=1");
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts));
-
-  unsigned int idx = 2;
-  _kernel.setArg(idx++, *hist_buf);
-
-  idx = 6;
-  _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
-  unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT));
-  cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1);
-
-  _kernel.setArg(0, *_in_key_buf);
-  _kernel.setArg(1, *_out_key_buf);
-  _kernel.setArg<cl_int>(3, _pass);
-  _kernel.setArg(4, *_in_ind_buf);
-  _kernel.setArg(5, *_out_ind_buf);
-
-  enqueue(queue, *this, window, lws);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {}
-
-void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts));
-
-  unsigned int idx = 1;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_out_key_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives()
-    : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n)
-{
-  ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr);
-  ARM_COMPUTE_ERROR_ON(n == 0);
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel = static_cast<cl::Kernel>(
-      CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts));
-
-  unsigned int idx = 4;
-  _kernel.setArg(idx++, *first_negative_idx_buf);
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  _kernel.setArg(idx++, *_in_key_buf);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_in_ind_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CLTopKV2Store::CLTopKV2Store()
-    : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr)
-{
-}
-
-void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n)
-{
-  ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr);
-  ARM_COMPUTE_ERROR_ON(k == 0);
-  ARM_COMPUTE_ERROR_ON(k > n);
-
-  _values = values;
-  _indices = indices;
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts));
-
-  unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2;
-  _kernel.setArg<cl_int>(idx++, n);
-
-  // Configure kernel window
-  Window win;
-  win.set(0, Window::Dimension(0, k, 1));
-  ICLKernel::configure_internal(win);
-}
-
-void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)
-{
-  _out_key_buf = out_key_buf;
-  _out_ind_buf = out_ind_buf;
-}
-
-void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  unsigned int idx = 0;
-  add_1D_tensor_argument(idx, _values, window);
-  add_1D_tensor_argument(idx, _indices, window);
-  _kernel.setArg(idx++, *_out_key_buf);
-  _kernel.setArg(idx++, *_out_ind_buf);
-
-  enqueue(queue, *this, window);
-}
-
-} // namespace arm_compute