26 files changed, 3042 insertions, 507 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
new file mode 100644
index 000000000..1fdd2f98f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLActivationLayerExKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/UtilsEx.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ActivationLayerInfoEx &act_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::F16, DataType::F32);
+  // Only input and output are examined; act_info is not inspected by any check in this function.
+  // Output must match the input's shape and data type, but only once it has been configured
+  if ((output != nullptr) && (output->total_size() != 0))
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+// Computes the kernel window; the Status is an "Insufficient Padding!" error if it had to change.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  if (output != nullptr)
+  {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input);
+  }
+  // Elements per iteration: as many as fit in 16 (assuming element_size() is in bytes - confirm)
+  const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+  bool window_changed = false;
+  // A null output means only the input access window is taken into account
+  if (output != nullptr)
+  {
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+  }
+  else
+  {
+    window_changed = update_window_and_padding(
+        win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+  }
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+// Starts unconfigured: no input/output tensor bound and in-place execution disabled.
+CLActivationLayerExKernel::CLActivationLayerExKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+// Builds the "activation_layer_ex" OpenCL kernel for the given tensors and activation settings.
+void CLActivationLayerExKernel::configure(ICLTensor *input, ICLTensor *output,
+                                          ActivationLayerInfoEx act_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+  // A null output, or an output that is the input itself, selects in-place execution
+  _run_in_place = (output == nullptr) || (output == input);
+
+  if (output != nullptr)
+  {
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+  }
+
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
+  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+  const DataType dt = input->info()->data_type();
+  float a_const = act_info.a();
+  float b_const = act_info.b();
+  int a_const_int = 0;
+  int b_const_int = 0;
+
+  // Create quantized version of constants a, b if needed
+  if (is_data_type_quantized(dt))
+  {
+    a_const_int =
+        input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+    b_const_int =
+        input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+  }
+
+  // Set build options
+  std::set<std::string> build_opts;
+  build_opts.emplace(
+      ("-DACT=" + lower_string(string_from_activation_func_ex(act_info.activation()))));
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized(dt))
+  {
+    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+    const int o1 = input->info()->quantization_info().offset;
+    // Quantized value of 0 corresponds to the offset o1
+    build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
+    // Set scale and offset of the input and output if they have different quantization info
+    if (is_data_type_quantized_asymmetric(dt) && output != nullptr)
+    {
+      const float s1 = input->info()->quantization_info().scale;
+      const float s2 = output->info()->quantization_info().scale;
+      const int o2 = output->info()->quantization_info().offset;
+
+      if (o1 != o2 || s1 != s2)
+      {
+        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+        build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+      }
+    }
+  }
+  else
+  {
+    build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+    build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+  }
+  // NOTE: when not running in place this inserts an empty string into the option set
+  build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+
+  // Create kernel
+  std::string kernel_name = std::string("activation_layer_ex");
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Make sure _kernel is initialized before calling the parent's configure
+  _input = input;
+  _output = output;
+
+  // Configure kernel window
+  auto win_config =
+      validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  _config_id = "activation_layer_ex_";
+  _config_id += lower_string(string_from_data_type(dt));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+// Static check: validates the arguments, then the window configuration on cloned tensor infos.
+Status CLActivationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ActivationLayerInfoEx &act_info)
+{
+  const bool run_in_place = (output == nullptr) || (output == input);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input->clone().get(),
+                                    (run_in_place) ? nullptr : output->clone().get())
+          .first);
+
+  return Status{};
+}
+// Enqueues the kernel once per 3D slice of the (collapsed, when possible) execution window.
+void CLActivationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    if (!_run_in_place) // the output tensor argument is only passed when not running in place
+    {
+      add_3D_tensor_argument(idx, _output, slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
new file mode 100644
index 000000000..c1a2ad0be
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxKernel.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t argminmax_axis)
+{
+  // Same shape as the input, except that the selected axis is set to extent 1.
+  TensorShape reduced_shape{input_shape};
+  reduced_shape.set(argminmax_axis, 1);
+
+  return reduced_shape;
+}
+} // namespace
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16; // not referenced in this file
+// Checks the input/output tensor infos and the axis of an arg-min/max operation.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F32,
+                                                       DataType::U8);
+  // NOTE(review): these two are the non-RETURN variants, unlike every other check here; confirm.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input, output);
+
+  // Reject a bad axis (unsigned, so only the upper bound matters) before inferOutputShape uses it
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      argminmax_axis >= input->tensor_shape().num_dimensions(),
+      "argminmax_axis must be greater than or equal to 0 and less than (input's rank).");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), argminmax_axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match argminmax_axis");
+  return Status{};
+}
+
+} // namespace
+// Starts unconfigured: tensors unset and the axis value-initialized.
+CLArgMinMaxKernel::CLArgMinMaxKernel() : _input(nullptr), _output(nullptr), _argminmax_axis() {}
+// Builds the "arg_op" OpenCL kernel for the given axis and operation and sets its window.
+void CLArgMinMaxKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), argminmax_axis, op));
+
+  _input = input;
+  _output = output;
+  _argminmax_axis = argminmax_axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), argminmax_axis));
+
+  // The kernel name is fixed; the operation is selected through the OP_CODE build option
+  std::string kernel_name = "arg_op";
+  int op_code = 0;
+  if (op == ArgOperation::MAX)
+  {
+    op_code = 1;
+  }
+  else if (op == ArgOperation::MIN)
+  {
+    op_code = 2;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window over the inferred output shape (the cloned info, not the tensor's)
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+// Static check of the arguments; all of the checking is done by validate_arguments().
+Status CLArgMinMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const uint32_t argminmax_axis, ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, argminmax_axis, op));
+
+  return Status{};
+}
+// Sets the axis arguments, then enqueues the kernel for each 4D slice of the window.
+void CLArgMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _argminmax_axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_argminmax_axis]);
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _argminmax_axis));
+
+  do
+  {
+    unsigned int idx = 0; // shadows the outer idx: tensor arguments start at index 0
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
new file mode 100644
index 000000000..1c505b4d5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLArithmeticSubtractionExKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16; // window step and access width
+// Validates data types and broadcast compatibility of the inputs, plus the output if configured.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_UNUSED(policy);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+  // An empty broadcast shape is treated as "the two input shapes cannot be broadcast together"
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        output->data_type() == DataType::U8 &&
+            (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+        "Output can only be U8 if both inputs are U8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+
+  return Status{};
+}
+// Fills in an empty output's shape/format and builds the window over the broadcast valid region.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2,
+                                                        ITensorInfo *output)
+{
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output, out_shape);
+    // Format priority: S16 if either input is S16, F16 only if both are F16, F32 if either is F32
+    if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output, Format::S16);
+    }
+    else if (input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output, Format::F16);
+    }
+    else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output, Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+  // NOTE(review): || short-circuits, so the later calls are skipped once one returns true
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+// Starts unconfigured: neither of the two inputs nor the output tensor is bound yet.
+CLArithmeticSubtractionExKernel::CLArithmeticSubtractionExKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+// Validates the tensors, builds the "arithmetic_sub_ex" OpenCL kernel and sets its window.
+void CLArithmeticSubtractionExKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                                ICLTensor *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+  // Set kernel build options: a floating-point output always selects -DWRAP over -DSATURATE
+  std::set<std::string> build_opts;
+  build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+  build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+  build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("arithmetic_sub_ex", build_opts));
+
+  ICLKernel::configure_internal(win_config.second);
+}
+// Static check: validates the arguments, then the window configuration on cloned tensor infos.
+Status CLArithmeticSubtractionExKernel::validate(const ITensorInfo *input1,
+                                                 const ITensorInfo *input2,
+                                                 const ITensorInfo *output, ConvertPolicy policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(),
+                                                            input2->clone().get(),
+                                                            output->clone().get())
+                                  .first);
+
+  return Status{};
+}
+// Enqueues the kernel per 3D slice, broadcasting input slices where an input dimension is <= 1.
+void CLArithmeticSubtractionExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+  // Collapsing from DimZ is allowed only if both inputs agree on every dimension from DimZ up
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+    // Advance the input slices here; the output slice is advanced by the loop condition
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+// Border: min(vector width - 1, how far the output's dimension 0 exceeds the narrower input's).
+BorderSize CLArithmeticSubtractionExKernel::border_size() const
+{
+  const auto width_in1 = _input1->info()->dimension(0);
+  const auto width_in2 = _input2->info()->dimension(0);
+  const unsigned int excess = _output->info()->dimension(0) - std::min(width_in1, width_in2);
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+  const unsigned int border_width = std::min<unsigned int>(max_border, excess);
+  return BorderSize(0, border_width, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
new file mode 100644
index 000000000..b0016d23c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBatchToSpaceNDKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks data types, block sizes and the input/output shape relationship for batch-to-space.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t *block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size[0] < 1 || block_size[1] < 1,
+                                  "Block size should be greater than or equal to 1.");
+  // Each check fires when its condition is true, so every condition states the violation.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                  "Input Depth should be equal to Output Depth");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(3) * block_size[0] * block_size[1] != input->dimension(3),
+      "Input batch should be equal to (output batch * block size[0] *block size[1])");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) % block_size[1]) ||
+                                      (output->dimension(1) % block_size[0]),
+                                  "Output height and width should be divisible by block size[0] "
+                                  "and block_size[1] respectively");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != input->dimension(0) * block_size[1]) ||
+                                      (output->dimension(1) != input->dimension(1) * block_size[0]),
+                                  "Output height and width should be equal to "
+                                  "input_height*blocksize[0] and input_width*blocksize[1] "
+                                  "respectively");
+  return Status{};
+}
+
+} // namespace
+// Starts unconfigured: no input or output tensor bound.
+CLBatchToSpaceNDKernel::CLBatchToSpaceNDKernel() : _input(nullptr), _output(nullptr) {}
+// Validates the tensors, builds the "batch_to_space_nd" OpenCL kernel and sets its window.
+void CLBatchToSpaceNDKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                       const int32_t *block_size)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE0=" + support::cpp11::to_string(block_size[0]));
+  build_opts.emplace("-DBLOCK_SIZE1=" + support::cpp11::to_string(block_size[1]));
+  build_opts.emplace("-DBATCH_OUT=" + support::cpp11::to_string(output->info()->dimension(3)));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("batch_to_space_nd", build_opts));
+
+  // Configure kernel window over the output tensor
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+// Enqueues the kernel for each 4D slice of the window.
+void CLBatchToSpaceNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Zeroed slice. NOTE(review): slice_out is bound to _input and slice_in to _output below
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_out);
+    add_4D_tensor_argument(idx, _output, slice_in);
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
new file mode 100644
index 000000000..3d2f2c702
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+// Validates operand data types, broadcast compatibility and, if configured, the output tensor.
+Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
+                           const ITensorInfo *output)
+{
+  const TensorShape expected_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+  // Operands must be single-channel U8 or QASYMM8.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
+  // A zero-sized broadcast shape is reported as incompatible operands.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(expected_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  const bool output_is_configured = output->total_size() > 0;
+  if (output_is_configured)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(expected_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
+    : _input1{nullptr}, _input2{nullptr}, _output{nullptr}
+{ // Tensor pointers stay null until configure() assigns them.
+}
+
+void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                        ICLTensor *output, BinaryLogicalOperation op)
+{
+  // Builds the "binary_logical_op" kernel for `op` (AND / OR) and sets up its execution window.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "binary_logical_op";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+
+  int op_code = 0;
+  switch (op)
+  {
+    case BinaryLogicalOperation::AND:
+      op_code = 1;
+      break;
+    case BinaryLogicalOperation::OR:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  // NOTE(review): `||` short-circuits after the first `true`; the result is currently ignored.
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+  ARM_COMPUTE_UNUSED(window_changed);
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+  // Enqueues the kernel once per 3D slice, collapsing the window from DimZ up when allowed.
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+  // Collapse is allowed when the smaller input has <= 1 element; otherwise see the checks below.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+  // has_collapsed is handed to collapse_if_possible() by pointer; it stays false if not called.
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+  // Input shapes are collapsed from DimZ only when has_collapsed ended up true.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+  // Each input gets its own slice, derived from the common slice and its (collapsed) shape.
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+  // Loop over the 3D slices of the (possibly collapsed) window.
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+    // Slide the input slices here; the loop condition slides the common slice.
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLBinaryLogicalOpKernel::border_size() const
+{
+  // X-extent by which the output exceeds the narrower input, capped at one vector minus one.
+  const auto narrow_x = std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int extra_x = _output->info()->dimension(0) - narrow_x;
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, extra_x);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index b019e8c33..bf7ebae3f 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -17,15 +17,8 @@
 #include "arm_compute/core/CL/kernels/CLCastKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
 
@@ -60,8 +53,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
   {
     const float scale_in = input->info()->quantization_info().scale;
     const int offset_in = input->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
 
     _kernel = static_cast<cl::Kernel>(
         CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts));
@@ -70,8 +63,8 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
   {
     const float scale_in = output->info()->quantization_info().scale;
     const int offset_in = output->info()->quantization_info().offset;
-    build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in));
-    build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in));
+    build_opts.emplace("-DSCALE=" + float_to_string_with_full_precision(scale_in));
+    build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(offset_in));
 
     _kernel = static_cast<cl::Kernel>(
         CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts));
@@ -88,7 +81,7 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output)
   update_window_and_padding(win, input_access, output_access);
   output_access.set_valid_region(win, input->info()->valid_region());
 
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
new file mode 100644
index 000000000..5af5b16ea
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+// Validates operand data types, broadcast compatibility and, if configured, the output tensor.
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
+                          const ITensorInfo *output)
+{
+  const TensorShape expected_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+  // Both operands are checked against the same list of single-channel data types.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::U16,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32, DataType::QASYMM8);
+  // A zero-sized broadcast shape is reported as incompatible operands.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(expected_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  const bool output_is_configured = output->total_size() > 0;
+  if (output_is_configured)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(expected_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLComparisonOpKernel::CLComparisonOpKernel() : _input1{nullptr}, _input2{nullptr}, _output{nullptr}
+{ // Tensor pointers stay null until configure() assigns them.
+}
+
+void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                     ICLTensor *output, const ComparisonOperation &op)
+{
+  // Builds the comparison kernel for `op` (EQUAL / NOT_EQUAL) and sets up its execution window.
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "comparison_op";
+  int op_code = 0;
+
+  switch (op)
+  {
+    case ComparisonOperation::EQUAL:
+      op_code = 1;
+      break;
+    case ComparisonOperation::NOT_EQUAL:
+      op_code = 2;
+      break;
+    default:
+      throw std::runtime_error(" Operation not supported, yet");
+  }
+
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
+  build_opts.emplace(("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  // Quantized inputs whose offset or scale differ use the "_qasymm8" kernel variant. Scales are
+  // emitted with full float precision, since to_string() may round small scales away.
+  const auto qinfo1 = input1->info()->quantization_info();
+  const auto qinfo2 = input2->info()->quantization_info();
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()) &&
+      ((qinfo1.offset != qinfo2.offset) || (qinfo1.scale != qinfo2.scale)))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(qinfo1.offset));
+    build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(qinfo2.offset));
+    build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(qinfo1.scale));
+    build_opts.emplace("-DSCALE_IN2=" + float_to_string_with_full_precision(qinfo2.scale));
+    kernel_name += "_qasymm8";
+  }
+
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+  // NOTE(review): -DDATA_TYPE_OUT above is read from the output before this auto-init runs.
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::S16 ||
+        input2->info()->data_type() == DataType::S16)
+    {
+      set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if (input1->info()->data_type() == DataType::F16 &&
+             input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+  ARM_COMPUTE_UNUSED(window_changed);
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLComparisonOpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+  // Enqueues the kernel once per 3D slice, collapsing the window from DimZ up when allowed.
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+  // Collapse is allowed when the smaller input has <= 1 element; otherwise see the checks below.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+  // has_collapsed is handed to collapse_if_possible() by pointer; it stays false if not called.
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+  // Input shapes are collapsed from DimZ only when has_collapsed ended up true.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+  // Each input gets its own slice, derived from the common slice and its (collapsed) shape.
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+  // Loop over the 3D slices of the (possibly collapsed) window.
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+    // Slide the input slices here; the loop condition slides the common slice.
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLComparisonOpKernel::border_size() const
+{
+  // X-extent by which the output exceeds the narrower input, capped at one vector minus one.
+  const auto narrow_x = std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int extra_x = _output->info()->dimension(0) - narrow_x;
+  const unsigned int border =
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, extra_x);
+  return BorderSize(0, border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
new file mode 100644
index 000000000..c386e3312
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks data types, block size and the input/output shape relation for depth-to-space.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  // ARM_COMPUTE_RETURN_ERROR_ON_MSG reports the message when its condition is TRUE, so every
+  // condition below states the invalid case. block_size is checked before it is used as divisor.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
+                                  "Output width should be equal to (Input width * block size)");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
+                                  "Output height should be equal to (Input height * block size)");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
+                                  "Input depth should be divisible by (block size * block size)");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      output->dimension(2) != input->dimension(2) / (block_size * block_size),
+      "Output depth should be equal to (Input depth / (block size * block size))");
+
+  return Status{};
+}
+} // namespace
+
+CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
+{
+  // DO NOTHING
+}
+
+void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  // Fail early on null tensors or on shapes that do not match the requested block size.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+
+  // Configure  kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // Window used for the output tensor argument and for the enqueue call.
+  Window out_window = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Window used for the input tensor argument: a copy with dimensions 0..3 set to (0, 0, 0).
+  Window in_window(out_window);
+  for (size_t dim = 0; dim < 4; ++dim)
+    in_window.set(dim, Window::Dimension(0, 0, 0));
+  // Enqueue once per slice; the input window is slid first, then the output window.
+  for (bool more = true; more;
+       more = window.slide_window_slice_4D(in_window) && window.slide_window_slice_4D(out_window))
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, in_window);
+    add_4D_tensor_argument(arg_idx, _output, out_window);
+    enqueue(queue, *this, out_window);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
new file mode 100644
index 000000000..0862b78bf
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // Window over the output; a window change is reported as an "Insufficient Padding!" status.
+  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+  const bool win_changed = update_window_and_padding(win, input_access, output_access);
+  input_access.set_valid_region(win, output->valid_region());
+
+  Status status{};
+  if (win_changed)
+    status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+  return std::make_pair(status, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+    : _input{nullptr}, _output{nullptr}, _lookups{nullptr}
+{ // Tensor pointers stay null until configure() assigns them.
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *lookups)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  // Input rank must lie in [2, 4]; the range test needs `||` because `&&` can never be true.
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+  // NOTE(review): non-RETURN macros presumably raise instead of filling the Status; confirm.
+  return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const ICLTensor *lookups)
+{
+  // Validates the tensors, builds the "embedding_lookup" kernel and sets the execution window.
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+  _input = input;
+  _output = output;
+  _lookups = lookups;
+
+  // Compile-time definitions handed to the OpenCL program.
+  const std::string kernel_name = "embedding_lookup";
+  std::set<std::string> cl_defines;
+
+  cl_defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  cl_defines.emplace("-DNUM_DIMS=" + support::cpp11::to_string(input->info()->num_dimensions()));
+  cl_defines.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  cl_defines.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, cl_defines));
+
+  // Window setup; a failing status from the helper is raised as an error.
+  const auto window_setup = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(window_setup.first);
+  ICLKernel::configure_internal(window_setup.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  // Input and output arguments share one 4D slice; the lookups tensor gets its own 1D window.
+  Window slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+  Window lookups_win;
+  lookups_win.set(Window::DimX, Window::Dimension(0, 0, 0));
+  // Enqueue per slice; stop as soon as either window cannot be slid any further.
+  for (bool more = true; more;
+       more = window.slide_window_slice_4D(slice) && window.slide_window_slice_1D(lookups_win))
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, slice);
+    add_4D_tensor_argument(arg_idx, _output, slice);
+    add_1D_tensor_argument(arg_idx, _lookups, lookups_win);
+    enqueue(queue, *this, slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
new file mode 100644
index 000000000..b1ee21bdc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLExpKernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+CLExpKernel::CLExpKernel() : _input{nullptr}, _output{nullptr} {}
+
+void CLExpKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  auto *const in_info = input->info();
+  auto *const out_info = output->info();
+  // An empty output takes shape, data type and quantization info from the input.
+  auto_init_if_empty(*out_info, in_info->tensor_shape(), 1, in_info->data_type(),
+                     in_info->quantization_info());
+
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  _input = input;
+  _output = output;
+
+  // Elements per window step; also passed to the OpenCL program as -DVEC_SIZE.
+  constexpr unsigned int vec_size = 4;
+  // Create kernel
+  std::set<std::string> cl_defines;
+  cl_defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(in_info->data_type()));
+  cl_defines.emplace("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("exp_layer", cl_defines));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*in_info, Steps(vec_size));
+  AccessWindowHorizontal input_access(in_info, 0, vec_size);
+  AccessWindowHorizontal output_access(out_info, 0, vec_size);
+  update_window_and_padding(win, input_access, output_access);
+  output_access.set_valid_region(win, in_info->valid_region());
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLExpKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Try to collapse the window from Window::DimZ upwards, then walk it in 3D slices.
+  Window exec_win = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice_3d = exec_win.first_slice_window_3D();
+  for (bool more = true; more; more = exec_win.slide_window_slice_3D(slice_3d))
+  {
+    unsigned int arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, slice_3d);
+    add_3D_tensor_argument(arg_idx, _output, slice_3d);
+    enqueue(queue, *this, slice_3d);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
index 23efafa6a..ae2801e2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
@@ -17,26 +17,14 @@
 #include "arm_compute/core/CL/kernels/CLGatherKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
 
 using namespace arm_compute;
 
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 1;
 
 Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
                           const ITensorInfo *output)
@@ -46,6 +34,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32,
                                                        DataType::F32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
 
   return Status{};
 }
@@ -57,8 +46,7 @@ CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(n
 void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32);
-  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
 
   _input1 = input1;
   _input2 = input2;
@@ -89,11 +77,10 @@ void CLGatherKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
   // Configure kernel window
-  const unsigned int num_elems_processed_per_iteration = 1;
   Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration));
   output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
new file mode 100644
index 000000000..cd7b21c6d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Computes the execution window from the output tensor and updates the padding requirements of
+// both tensors for horizontal accesses of num_elems_processed_per_iteration elements.
+//
+// Returns the window together with a Status that reports "Insufficient Padding!" when
+// update_window_and_padding() signals that the window had to change.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  const bool padding_insufficient = update_window_and_padding(win, input_access, output_access);
+
+  // NOTE(review): the valid region is set on the *input* access from the *output* region;
+  // sibling kernels do the reverse - confirm this is intended.
+  input_access.set_valid_region(win, output->valid_region());
+
+  if (padding_insufficient)
+  {
+    return std::make_pair(
+        ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), win);
+  }
+
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+
+// Creates an unconfigured kernel: the tensor pointers listed here start out null and are
+// assigned later by configure().
+// NOTE(review): _keys, _hits and _lookup_indices are not part of this initializer list -
+// confirm the header gives them default member initializers.
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+    : _input{nullptr}, _output{nullptr}, _lookups{nullptr}
+{
+  // Nothing else to set up until configure() is called
+}
+
+// Validates the tensor metadata of a hashtable-lookup configuration.
+//
+// lookups - values to search for (1-D, S32)
+// keys    - table keys (1-D, S32)
+// input   - table values (2-D to 4-D)
+// output  - gathered values; same data type as input, shape must already be set and its
+//           outermost dimension must equal the number of lookups
+// hits    - per-lookup hit flags (1-D, U8 or QASYMM8, one entry per lookup)
+//
+// Returns an empty Status when every check passes, otherwise the first failing check.
+// All checks report through the returned Status (rather than asserting) so callers that only
+// want to probe a configuration get a usable answer in every build type.
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  // One hit flag and one output slice are produced per lookup, so the sizes must agree.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(lookups->dimension(0) != hits->dimension(0),
+                                  "Hits must have one element per lookup");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(output->num_dimensions() - 1) !=
+                                      lookups->dimension(0),
+                                  "Output's outermost dimension must match the number of lookups");
+
+  // The input rank must lie in [2, 4]; either bound being violated is an error.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "Input must have between 2 and 4 dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(lookups->num_dimensions() > 1, "Lookups must be 1-D");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(keys->num_dimensions() > 1, "Keys must be 1-D");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(hits->num_dimensions() > 1, "Hits must be 1-D");
+
+  return Status{};
+}
+
+// Configures the kernel for a hashtable lookup.
+//
+// lookups - values to search for
+// keys    - table keys
+// input   - table values
+// output  - destination for the gathered values
+// hits    - destination for the per-lookup hit flags (filled on the host in run())
+//
+// Besides building the OpenCL kernel, this allocates an internal S32 tensor with the shape of
+// `lookups` that run() fills with the resolved key index of every lookup.
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  // Every tensor is dereferenced below, so reject null pointers up front
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+
+  // validate() expects (lookups, keys, input, output, hits) - pass all five, in that order
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+  _lookups = lookups;
+  _keys = keys;
+  _input = input;
+  _output = output;
+  _hits = hits;
+
+  // Make _lookup_indices tensor
+  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+  _lookup_indices->allocator()->init(
+      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+  _lookup_indices->allocator()->allocate();
+
+  // Set kernel build options
+  const std::string kernel_name = "hashtable_lookup";
+  std::set<std::string> build_opts;
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+// Executes the lookup.
+//
+// The key search itself runs on the host: lookups and keys are mapped, every lookup value is
+// resolved to the index of its key (or -1 when absent) and the result is written to
+// _lookup_indices, while _hits receives 1/0 per lookup. The OpenCL kernel is then enqueued with
+// the input, the output and the resolved indices.
+void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  // Make the buffers touched by the host-side search accessible
+  const_cast<ICLTensor *>(_lookups)->map(queue);
+  const_cast<ICLTensor *>(_keys)->map(queue);
+  _hits->map(queue);
+  _lookup_indices->map(queue);
+
+  // Set values of hits
+  const int32_t *lookups_buf =
+      reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+  const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
+  uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
+  int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
+
+  // key value -> position of that key; a repeated key keeps its last position
+  std::map<int32_t, size_t> key_map;
+  const size_t keys_num = _keys->info()->dimension(0);
+  for (size_t key_index = 0; key_index < keys_num; key_index++)
+  {
+    key_map[keys_buf[key_index]] = key_index;
+  }
+
+#if defined(DEBUG)
+  // A resolved key index is presumably used to address the outermost dimension of the value
+  // tensor, so that extent - not the number of lookups - is the bound it has to respect.
+  const size_t values_num = _input->info()->dimension(_input->info()->num_dimensions() - 1);
+#endif // defined(DEBUG)
+
+  const size_t lookups_num = _lookups->info()->dimension(0);
+  for (size_t i = 0; i < lookups_num; ++i)
+  {
+    const auto lookup_value = lookups_buf[i];
+    const auto it = key_map.find(lookup_value);
+    if (it != key_map.end())
+    {
+#if defined(DEBUG)
+      if (it->second >= values_num)
+        ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds.");
+#endif // defined(DEBUG)
+      lookup_indices_buf[i] = static_cast<int32_t>(it->second);
+      hits_buf[i] = static_cast<uint8_t>(1);
+    }
+    else
+    {
+      // Miss: -1 marks "no key found" for this lookup
+      lookup_indices_buf[i] = -1;
+      hits_buf[i] = static_cast<uint8_t>(0);
+    }
+  }
+
+  const_cast<ICLTensor *>(_lookups)->unmap(queue);
+  const_cast<ICLTensor *>(_keys)->unmap(queue);
+  _hits->unmap(queue);
+  _lookup_indices->unmap(queue);
+
+  Window win = window.collapse(ICLKernel::window(), 2, 4);
+
+  // The index tensor is always bound from its first element
+  Window win_lookup;
+  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, win);
+    add_4D_tensor_argument(idx, _output, win);
+    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+    enqueue(queue, *this, win);
+  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
new file mode 100644
index 000000000..80d99dd3b
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks that input and output are single-channel S16/S32/F16/F32 tensors with identical
+// shape and data type.
+//
+// Returns an empty Status on success, otherwise the first failing check. The checks report
+// through the returned Status so the caller's ARM_COMPUTE_ERROR_THROW_ON can act on them.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  // input/output are already ITensorInfo pointers here, so the shape is compared on them
+  // directly (there is no ->info() level to go through).
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  return Status{};
+}
+
+} // namespace
+
+// Creates an unconfigured kernel; both tensor pointers stay null until configure() runs.
+CLNegKernel::CLNegKernel()
+    : _input{nullptr}, _output{nullptr}
+{
+}
+
+// Configures the negation kernel: validates the tensors, builds the "neg_tensor" OpenCL kernel
+// for the input's data type and sets up the execution window and padding.
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  // Elements handled per iteration; also handed to the kernel as VEC_SIZE
+  constexpr unsigned int vector_width = 16;
+
+  // Create kernel
+  const std::string data_type_opt =
+      "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
+  const std::string vec_size_opt = "-DVEC_SIZE=" + support::cpp11::to_string(vector_width);
+  std::set<std::string> build_opts{data_type_opt, vec_size_opt};
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+  // Configure window
+  Window execution_window = calculate_max_window(*input->info(), Steps(vector_width));
+
+  AccessWindowHorizontal src_access(input->info(), 0, vector_width);
+  AccessWindowHorizontal dst_access(output->info(), 0, vector_width);
+  update_window_and_padding(execution_window, src_access, dst_access);
+  dst_access.set_valid_region(execution_window, input->info()->valid_region());
+
+  ICLKernel::configure_internal(execution_window);
+}
+
+// Enqueues the negation kernel over the window, one 3D slice at a time, using the kernel's
+// local-workgroup-size hint.
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Merge the dimensions from Z upwards when the window permits it
+  Window merged_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window current_slice = merged_window.first_slice_window_3D();
+
+  bool has_more_slices = true;
+  while (has_more_slices)
+  {
+    // Kernel arguments are re-bound from index 0 for every slice
+    unsigned int arg_index = 0;
+    add_3D_tensor_argument(arg_index, _input, current_slice);
+    add_3D_tensor_argument(arg_index, _output, current_slice);
+    enqueue(queue, *this, current_slice, lws_hint());
+
+    has_more_slices = merged_window.slide_window_slice_3D(current_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
new file mode 100644
index 000000000..12bbe910f
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNormalizationLayerExKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks that the input is a single-channel F16/F32 tensor and, when the output has already
+// been configured, that it matches the input in data type and shape.
+// norm_info is accepted for signature symmetry but is not inspected here.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+  // An output that is still empty will be auto-initialised later; nothing more to check
+  const bool output_is_configured = (output->total_size() != 0);
+  if (!output_is_configured)
+  {
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  return Status{};
+}
+
+// Auto-initialises the output from the input if needed, then computes the execution window and
+// the padding both tensors need.
+//
+// For in-map normalization the input is read with a left border of min(norm_size / 2, 3)
+// elements and 2 * (norm_size / 2) extra elements per iteration; otherwise exactly the
+// processed elements are read.
+//
+// Returns the window together with a Status that reports "Insufficient Padding!" when
+// update_window_and_padding() signals that the window had to change.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        NormalizationLayerInfo norm_info)
+{
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output, *input->clone());
+
+  const unsigned int elems_per_iteration = 4;
+  const unsigned int half_norm_size = norm_info.norm_size() / 2;
+
+  unsigned int horizontal_border = 0;
+  unsigned int elems_read_per_iteration = elems_per_iteration;
+  if (norm_info.is_in_map())
+  {
+    horizontal_border = std::min(half_norm_size, 3U);
+    elems_read_per_iteration = elems_per_iteration + 2 * half_norm_size;
+  }
+  const BorderSize border = BorderSize(0, horizontal_border);
+
+  Window win = calculate_max_window(*input, Steps(elems_per_iteration));
+
+  // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside
+  // the kernel, avoiding padding
+  AccessWindowHorizontal src_access(input, -border.left, elems_read_per_iteration);
+  AccessWindowHorizontal dst_access(output, 0, elems_per_iteration);
+
+  const bool padding_insufficient = update_window_and_padding(win, src_access, dst_access);
+
+  dst_access.set_valid_region(win, input->valid_region());
+
+  if (padding_insufficient)
+  {
+    return std::make_pair(
+        ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), win);
+  }
+
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+
+// Creates an unconfigured kernel: null tensors, a zero border and the in-map flag cleared.
+CLNormalizationLayerExKernel::CLNormalizationLayerExKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _border_size(0),
+      _is_in_map(false)
+{
+  // Nothing else to set up until configure() is called
+}
+
+// Returns the border size currently stored in the kernel.
+BorderSize CLNormalizationLayerExKernel::border_size() const
+{
+  return _border_size;
+}
+
+// Configures the normalization kernel: auto-initialises the output from the input if it is
+// still empty, validates the tensors, builds the OpenCL kernel with the normalization
+// parameters as compile-time definitions, sets up the execution window and composes the
+// config id used for LWS tuning.
+//
+// NOTE(review): within this file _is_in_map and _border_size are only ever set by the
+// constructor (false / 0); this function never updates them from norm_info. As written the
+// kernel-name selection below therefore always picks "normalization_layer_cross_map" and
+// border_size() always reports 0, even though validate_and_configure_window() does account
+// for an in-map border. Confirm whether in-map normalization is meant to be supported here.
+void CLNormalizationLayerExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                             NormalizationLayerInfo norm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output->info(), *input->info()->clone());
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
+
+  _input = input;
+  _output = output;
+
+  const unsigned int num_elems_processed_per_iteration = 4;
+  const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.add_option(
+      ("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+  build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+  build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+  build_opts.add_option(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  // RADIUS is passed as the full norm_size here, whereas the window set-up reads
+  // norm_size / 2 extra elements per side for in-map types.
+  build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size())));
+  build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+  build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+
+  // Create kernel
+  // The kernel is fetched from the stock CLKernelLibrary, not CLKernelLibraryEx.
+  std::string kernel_name =
+      _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Set config_id for enabling LWS tuning
+  // Layout: normalization_layer_<data type>_<norm type>_<norm size>_<dim0>_<dim1>
+  _config_id = "normalization_layer_";
+  _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(
+      static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(norm_info.norm_size());
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(0));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+// Static validation: runs the argument checks and then a dry run of the window configuration
+// on clones of the tensor infos, so the caller's infos are left untouched.
+Status CLNormalizationLayerExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                              NormalizationLayerInfo norm_info)
+{
+  const Status arguments_status = validate_arguments(input, output, norm_info);
+  ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);
+
+  // Work on copies: configuring the window may alter padding on the infos it is given
+  const auto input_copy = input->clone();
+  const auto output_copy = output->clone();
+  const Status window_status =
+      validate_and_configure_window(input_copy.get(), output_copy.get(), norm_info).first;
+  ARM_COMPUTE_RETURN_ON_ERROR(window_status);
+
+  return Status{};
+}
+
+// Enqueues the normalization kernel over the window, one 3D slice at a time. The dimension
+// from which the window is collapsed depends on the stored in-map flag.
+void CLNormalizationLayerExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  // In-map: collapse from Z upwards; otherwise only from dimension 4 upwards
+  const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
+  Window merged_window = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
+  Window current_slice = merged_window.first_slice_window_3D();
+
+  bool has_more_slices = true;
+  while (has_more_slices)
+  {
+    // Kernel arguments are re-bound from index 0 for every slice
+    unsigned int arg_index = 0;
+    add_3D_tensor_argument(arg_index, _input, current_slice);
+    add_3D_tensor_argument(arg_index, _output, current_slice);
+    enqueue(queue, *this, current_slice);
+
+    has_more_slices = merged_window.slide_window_slice_3D(current_slice);
+  }
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
new file mode 100644
index 000000000..241f8ae4d
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+// Validates the tensor metadata for PReLU.
+//
+// input  - data tensor info (single channel, F16/F32/QASYMM8)
+// alpha  - slope tensor info (single channel, F16/F32/QASYMM8), broadcast against input
+// output - result tensor info; only checked when it has already been configured
+//
+// Returns an empty Status on success, otherwise the first failing check.
+Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
+                                                       DataType::QASYMM8);
+
+  // broadcast_shape() yields an empty shape when the two shapes cannot be broadcast together
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    // The output may be any type the inputs may be: configure() requires input and output to
+    // share a data type, so rejecting QASYMM8 here would make the quantized path unusable.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+// Creates an unconfigured kernel; all tensor pointers stay null until configure() runs.
+CLPReLUKernel::CLPReLUKernel()
+    : _input{nullptr}, _alpha{nullptr}, _output{nullptr}
+{
+}
+
+// Configures the PReLU kernel.
+//
+// input  - data tensor
+// alpha  - slope tensor, broadcast against input
+// output - destination; its shape/format are auto-initialised from the broadcast result when
+//          not yet set
+//
+// Builds the "prelu" kernel (or "prelu_qasymm8" with the quantization offsets and scales of all
+// three tensors for quantized input) and sets up the broadcast-aware execution window.
+void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
+{
+  // Every tensor is dereferenced below, so reject null pointers up front
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  // The validation helper of this file is validate_info(); there is no validate() to call
+  ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
+
+  _input = input;
+  _alpha = alpha;
+  _output = output;
+
+  // Create kernel
+  std::string kernel_name = "prelu";
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+  if (is_data_type_quantized_asymmetric(input->info()->data_type()))
+  {
+    // Quantized path: pass the offset and scale of each tensor to the kernel
+    build_opts.emplace("-DOFF_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
+    build_opts.emplace("-DOFF_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_OUT=" +
+                       support::cpp11::to_string(output->info()->quantization_info().scale));
+    kernel_name += "_qasymm8";
+  }
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input->info()->data_type() == DataType::F32 ||
+             alpha->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+
+  // Each input gets its own window in which broadcast dimensions are pinned
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
+
+  AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueues the PReLU kernel over the window, one 3D slice at a time, keeping separate slice
+// windows for the two inputs so that broadcast dimensions are handled per input.
+void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &in_shape1 = _input->info()->tensor_shape();
+  const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+
+  // Collapsing from Z upwards is allowed when the smaller input has at most one element, or
+  // when both inputs have dimensions beyond Z and agree on every dimension from Z upwards.
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  // has_collapsed reports whether collapse_if_possible() actually merged the dimensions
+  bool has_collapsed = false;
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+
+  // The input shapes must be collapsed the same way as the window before deriving the
+  // per-input broadcast slices from them.
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    // Kernel arguments are re-bound from index 0 for every slice
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice_input1);
+    add_3D_tensor_argument(idx, _alpha, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+
+    // Advance the input slices first; only the output slice decides whether the loop continues
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+// Returns the border required on the right-hand side only: the difference between the output
+// width and the narrower input width, capped at num_elems_processed_per_iteration - 1.
+BorderSize CLPReLUKernel::border_size() const
+{
+  const size_t narrowest_input_width =
+      std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
+  const unsigned int replicate_size = _output->info()->dimension(0) - narrowest_input_width;
+
+  const unsigned int max_border = num_elems_processed_per_iteration - 1U;
+  const unsigned int right_border = std::min<unsigned int>(max_border, replicate_size);
+
+  return BorderSize(0, right_border, 0, 0);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
new file mode 100644
index 000000000..99b54c822
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+                          const ITensorInfo *pad_size_info)
+{
+  // Validates the input, output and pad-size tensor descriptions used by the pad kernel.
+  // Returns an empty Status on success, otherwise the Status of the first failed check.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input_info, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::S32, DataType::F16,
+      DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      output_info, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::S32, DataType::F16,
+      DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+  // RETURN_ERROR_ON_MSG reports an error when its condition is TRUE: test for the invalid case.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 0 ||
+                                      input_info->num_dimensions() > 4,
+                                  "Pad kernel supports upto 4-D input tensor");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input_info->num_dimensions() != output_info->num_dimensions(),
+      "output tensor should have same number of dimensions as input tensor");
+  if (input_info->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+                                        output_info->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+  return Status{};
+}
+
+} // namespace
+// Builds a kernel with no tensors attached yet; configure() assigns them.
+CLPadLayerKernel::CLPadLayerKernel() : _input{nullptr}, _output{nullptr}, _pad_size{nullptr} {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+  _input = input;
+  _output = output;
+  _pad_size = pad_size;
+
+  const auto *in_info = input->info();
+  auto *out_info = output->info();
+
+  // ZERO_VALUE is the quantization offset for QASYMM8 inputs and 0 for every other data type.
+  const auto zero_value =
+      (in_info->data_type() == DataType::QASYMM8) ? in_info->quantization_info().offset : 0;
+
+  // Compile-time definitions handed to the OpenCL program: data type, extents and zero value.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(in_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(out_info->dimension(2)));
+  build_opts.emplace("-DIB=" + support::cpp11::to_string(in_info->dimension(3)));
+  build_opts.emplace("-DIW=" + support::cpp11::to_string(in_info->dimension(0)));
+  build_opts.emplace("-DIH=" + support::cpp11::to_string(in_info->dimension(1)));
+  build_opts.emplace("-DID=" + support::cpp11::to_string(in_info->dimension(2)));
+  build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(zero_value));
+
+  // Build the "pad" kernel with those definitions.
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+  // The execution window is computed from the output tensor.
+  Window win = calculate_max_window(*out_info, Steps());
+
+  // Mark the whole output shape as valid, anchored at the zero coordinate.
+  Coordinates coord;
+  coord.set_num_dimensions(out_info->num_dimensions());
+  out_info->set_valid_region(ValidRegion(coord, out_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+  // Reads the leading pad amounts from _pad_size (mapped while read), then enqueues the kernel.
+  _pad_size->map(queue);
+
+  // Padding values only for up, top, left and front are required based on the rank of tensor
+  int rank = _pad_size->info()->dimension(1);
+  // A value is 0 when the rank is too small for that axis; otherwise it is element {0, row}.
+  auto pad_batch_up =
+      (rank == 4) ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0})) : 0;
+  auto pad_height_top =
+      (rank >= 2)
+          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 2) ? 0 : 1}))
+          : 0;
+  auto pad_width_left = (rank >= 1)
+                            ? *reinterpret_cast<const int32_t *>(
+                                  _pad_size->ptr_to_element({0, (rank == 4) ? 2 : rank - 1}))
+                            : 0;
+  auto pad_depth_front =
+      (rank >= 3)
+          ? *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, (rank == 3) ? 0 : 3}))
+          : 0;
+
+  _pad_size->unmap(queue);
+
+  // Pad values passed to the kernel, packed in the order (width, height, depth, batch)
+  const cl_int4 paddingValues = {
+      {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+       static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: a copy of the output slice with its first four dimensions zeroed
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+  // Bind input, output and pad values per slice; loop ends once either slide returns false.
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    _kernel.setArg<cl_int4>(idx++, paddingValues);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
new file mode 100644
index 000000000..aa094761c
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPermuteExKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
+{ // Returns a copy of the input's shape after permute() has been applied to it with `perm`.
+  TensorShape permuted_shape = input->tensor_shape();
+  permute(permuted_shape, perm);
+  return permuted_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const PermutationVector &perm)
+{
+  // Checks the input data type and, if the output is already configured, its shape and type.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  const TensorShape expected_shape =
+      misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+  // An empty (total_size() == 0) output has nothing to compare against yet.
+  if (output->total_size() == 0)
+  {
+    return Status{};
+  }
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  return Status{};
+}
+} // namespace
+// Builds a kernel with no tensors attached and a default permutation; configure() sets them.
+CLPermuteExKernel::CLPermuteExKernel() : _input{nullptr}, _output{nullptr}, _perm() {}
+
+void CLPermuteExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                  const PermutationVector &perm)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+  _input = input;
+  _output = output;
+  _perm = perm;
+
+  // Auto-initialise the output from the input's info and the permuted shape if still empty.
+  const TensorShape permuted_shape = get_output_shape(input->info(), perm);
+  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(permuted_shape));
+  // Compile-time definitions for the OpenCL program: data type and input depth.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+  // P1..P4 carry the first four entries of the permutation vector to the kernel.
+  for (unsigned int i = 0; i < 4; ++i)
+  {
+    build_opts.emplace("-DP" + support::cpp11::to_string(i + 1) + "=" +
+                       support::cpp11::to_string(perm[i]));
+  }
+
+  // Build the "permute_generic" kernel with those definitions.
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("permute_generic", build_opts));
+
+  // The execution window is computed from the input tensor.
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  // No padding is required, so the whole output shape is simply marked as valid.
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLPermuteExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const PermutationVector &perm)
+{
+  // Reject null descriptors first, then hand every other check to validate_arguments().
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  return validate_arguments(input, output, perm);
+}
+
+void CLPermuteExKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+  // Enqueues the permute kernel slice by slice, iterating over slices of the input window.
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice: a copy of the input slice with its first four dimensions zeroed
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+  // The kernel is enqueued over the input slice; the loop ends once either slide returns false.
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
index a3e0163de..b985aa737 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp
@@ -17,20 +17,8 @@
 #include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
 
 using namespace arm_compute;
 
@@ -45,12 +33,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
   ARM_COMPUTE_UNUSED(overflow_policy);
   ARM_COMPUTE_UNUSED(rounding_policy);
 
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8,
-                                                       DataType::QS16, DataType::S16, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8,
-                                                       DataType::QS16, DataType::S16, DataType::F16,
-                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
+                                                       DataType::F16, DataType::F32);
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
 
   const TensorShape &out_shape =
@@ -58,21 +44,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
                                   "Inputs are not broadcast compatible");
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
-
-  if (is_data_type_fixed_point(input1->data_type()))
-  {
-    // All data types must be all QS8 or all QS16
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1,
-                                    "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
-  }
 
   // Validate in case of configured output
   if (output->total_size() > 0)
   {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8,
-                                                         DataType::QS16, DataType::S16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
         output->data_type() == DataType::U8 &&
@@ -81,11 +57,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
         detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
         "Wrong shape for output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
-    if (is_data_type_fixed_point(input1->data_type()))
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-    }
   }
 
   return Status{};
@@ -191,14 +162,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
     {
       compute_type = "int";
     }
-    else if (input1->info()->data_type() == DataType::QS8)
-    {
-      compute_type = "qs8";
-    }
-    else if (input1->info()->data_type() == DataType::QS16)
-    {
-      compute_type = "qs16";
-    }
     else
     {
       compute_type = "ushort";
@@ -218,11 +181,6 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
           : "-DSATURATE");
   build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz"
                                                                   : "-DROUND=_rte");
-  if (is_data_type_fixed_point(input1->info()->data_type()))
-  {
-    build_opts.emplace("-DFIXED_POINT_POSITION=" +
-                       support::cpp11::to_string(input1->info()->fixed_point_position()));
-  }
   build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
   build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
   build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -245,7 +203,7 @@ void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTens
     _kernel.setArg(idx++, scale);
   }
 
-  ICLKernel::configure(win_config.second);
+  ICLKernel::configure_internal(win_config.second);
 }
 
 Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
deleted file mode 100644
index 168b246bf..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
-  // We can handle for simple case only
-  // Input rank: 2
-  // Output rank: 1
-  // Axis: one axis value, restrict to 1
-
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1");
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
-                                  "Inputs are not broadcast compatible");
-
-  // Validate in case of configured output
-  if (output->total_size() > 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(),
-                                    "Output same type allowed for input and output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1,
-                                    "Only support for output dimension 1");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2,
-                                    "Only support for input dimension 2");
-  }
-
-  return Status{};
-}
-
-} // namespace
-
-CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {}
-
-void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info()));
-
-  _input = input;
-  _output = output;
-  _axis = axis;
-
-  // Configure kernel window
-  int cols = _input->info()->tensor_shape()[0];
-  int rows = _input->info()->tensor_shape()[1];
-  Window win;
-  win.set(0, Window::Dimension(0, cols, 1));
-  win.set(1, Window::Dimension(0, rows, 1));
-
-  // Construct kernel name
-  std::string kernel_name = "reduce_max";
-
-  // Set kernel build options
-  std::set<std::string> build_opts;
-  build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols));
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
-  ICLKernel::configure(win);
-}
-
-Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis,
-                                   const ITensorInfo *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output));
-
-  return Status{};
-}
-
-void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-  Window window_input = window;
-  Window slice_input = window_input.first_slice_window_1D();
-
-  do
-  {
-    Window slice_output = slice_input.shift_dimensions(1);
-    unsigned int idx = 0;
-    add_1D_tensor_argument(idx, _input, slice_input);
-    add_1D_tensor_argument(idx, _output, slice_output);
-    enqueue(queue, *this, slice_input);
-
-  } while (window_input.slide_window_slice_1D(slice_input));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..f581780e1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  // Same shape as the input, except that the extent along `axis` is set to 1.
+  TensorShape reduced_shape{input_shape};
+  reduced_shape.set(axis, 1);
+
+  return reduced_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ReduceOperation op)
+{
+  // Checks that `input` can be reduced along `axis` with operation `op` into `output`.
+  // Returns an empty Status on success, otherwise the Status of the first failed check.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReduceOperation::MEAN || op == ReduceOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "Not support QASYMM8, yet");
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+
+  // RETURN_ERROR_ON_MSG reports an error when its condition is TRUE: test for a bad axis.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      axis >= input->tensor_shape().num_dimensions(),
+      "axis must be greater than or equal to 0 and less than (input's rank).");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+  return Status{};
+}
+} // namespace
+// Builds a kernel with no tensors attached and a value-initialised axis; see configure().
+CLReduceOperationKernel::CLReduceOperationKernel() : _input{nullptr}, _output{nullptr}, _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  // Work on a clone of the output info whose shape is the input shape with `axis` set to 1.
+  std::unique_ptr<ITensorInfo> reduced_info = output->info()->clone();
+  reduced_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Pick the OpenCL kernel name and the OP_CODE definition that goes with the operation.
+  std::string kernel_name;
+  int op_code = 0;
+  switch (op)
+  {
+    case ReduceOperation::MAX:
+      kernel_name = "reduce_min_max";
+      op_code = 1;
+      break;
+    case ReduceOperation::MIN:
+      kernel_name = "reduce_min_max";
+      op_code = 2;
+      break;
+    case ReduceOperation::SUM:
+      kernel_name = "reduce_sum_mean";
+      op_code = 3;
+      break;
+    case ReduceOperation::MEAN:
+      kernel_name = "reduce_sum_mean";
+      op_code = 4;
+      break;
+    default:
+      throw std::runtime_error("Operation not supported, yet");
+  }
+
+  // Compile-time definitions for the OpenCL program: data type, output depth and op code.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(reduced_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(reduced_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Build the selected kernel with those definitions.
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // The execution window is computed from the reduced (cloned) output info.
+  Window win = calculate_max_window(*reduced_info, Steps());
+
+  // The real output's valid region is set from the reduced shape, anchored at zero.
+  Coordinates coord;
+  coord.set_num_dimensions(reduced_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, reduced_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const uint32_t axis, ReduceOperation op)
+{
+  // Static validation entry point: null or unsupported arguments are reported through the
+  // returned Status (RETURN variant of the null check) instead of being asserted on.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  return validate_arguments(input, output, axis, op);
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+  // Enqueues the reduction once over the collapsed window, temporarily reshaping the output.
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+  // Trailing kernel arguments: the reduced axis and the input's extent along that axis.
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  // Support dimensions up to 4
+  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: a copy of the output slice with its first four dimensions zeroed
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+  // of input and output are the same
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+  // The tensor arguments occupy the leading slots, so the argument index restarts at 0.
+  idx = 0;
+  add_4D_tensor_argument(idx, _input, slice_in);
+  add_4D_tensor_argument(idx, _output, slice_out);
+  enqueue(queue, *this, slice_out);
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
deleted file mode 100644
index 84a77122d..000000000
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          std::vector<uint32_t> axis)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions,
-                                  "Reduction axis greater than max number of dimensions");
-
-  std::vector<uint32_t>::const_iterator it;
-  bool axis_w = false;
-  bool axis_h = false;
-  for (it = axis.begin(); it != axis.end(); ++it)
-  {
-    if ((*it) == 0)
-    {
-      axis_w = true;
-    }
-    else if ((*it) == 1)
-    {
-      axis_h = true;
-    }
-    else
-    {
-      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
-    }
-  }
-  // TODO Other axises (currently, only axises for both width and height are supported.)
-  if (!axis_w || !axis_h)
-  {
-    ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!");
-  }
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
-  }
-
-  return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                         std::vector<uint32_t> axis)
-{
-  // Output tensor auto initialization if not yet initialized
-  TensorShape output_shape{input->tensor_shape()};
-  output_shape.set(0, 1);
-  output_shape.set(1, 1);
-  auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(),
-                     input->fixed_point_position());
-
-  // Configure kernel window
-  constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
-  const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-
-  Window win = calculate_max_window(
-      *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-  AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
-                                     num_elems_processed_per_iteration_y);
-  AccessWindowHorizontal output_access(output, 0, 1);
-  bool window_changed = update_window_and_padding(win, input_access, output_access);
-  output_access.set_valid_region(win, output->valid_region());
-
-  Status err = (window_changed)
-                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                   : Status{};
-
-  return std::make_tuple(err, win);
-}
-} // namespace
-
-CLReductionMeanKernel::CLReductionMeanKernel()
-    : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size()
-{
-}
-
-BorderSize CLReductionMeanKernel::border_size() const { return _border_size; }
-
-void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                      std::vector<uint32_t> axis)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis));
-
-  _input = input;
-  _output = output;
-  _reduction_axis = axis;
-
-  constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step
-
-  // Set border size
-  _border_size = BorderSize(
-      ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) -
-      input->info()->dimension(0));
-
-  // Set build options
-  std::set<std::string> build_opts;
-  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-  // build_opts.emplace(("-DVEC_SIZE=" +
-  // support::cpp11::to_string(num_elems_processed_per_iteration)));
-  if (is_data_type_fixed_point(input->info()->data_type()))
-  {
-    build_opts.emplace("-DFIXED_POINT_POSITION=" +
-                       support::cpp11::to_string(input->info()->fixed_point_position()));
-  }
-
-  // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts));
-
-  // Configure kernel window
-  auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
-
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-  ICLKernel::configure(std::get<1>(win_config));
-}
-
-Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                       std::vector<uint32_t> axis)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
-  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
-      validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
-
-  return Status{};
-}
-
-void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  // Set out window
-  Window out_window(window);
-  out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-  // Get first input and output slices
-  Window in_slice = window.first_slice_window_2D();
-  Window out_slice = out_window.first_slice_window_2D();
-
-  // Set local sums buffer
-  // TODO work_group
-  unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-
-  unsigned int idx = 2 * num_arguments_per_2D_tensor();
-  _kernel.setArg(idx++, local_sum_size, nullptr);
-  _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height
-  _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) *
-                                                    _input->info()->dimension(1))); // divider
-
-  do
-  {
-    unsigned int idx = 0;
-    add_2D_tensor_argument(idx, _input, in_slice);
-    in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
-    add_2D_tensor_argument(idx, _output, out_slice);
-    enqueue(queue, *this, in_slice);
-  } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..6b0697e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+// Returns an error Status when data types, rank, layout or quantization info are unsupported.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+                          const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "The number of dimensions of input should be equal to output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+                                  "The input and output layouts are different!");
+
+  // TODO Support other cases
+  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+  }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+  }
+  // Rank guard: '||' because a rank cannot be both < 2 and > 4 (only 4-D input reaches here).
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+  if (input->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {} // NOTE(review): _block_size/_padding_size are not initialized here
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                       const ICLTensor *padding_size, ICLTensor *output)
+{
+  // Validates the operands, then builds the layout-specific 4-D kernel and its execution window.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_size, padding_size, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+  _input = input;
+  _block_size = block_size;
+  _padding_size = padding_size;
+  _output = output;
+
+  // Set kernel build options
+  // TODO Support other cases
+  std::string kernel_name = "space_to_batch_4d";
+  std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+    win = calculate_max_window(*output->info(), Steps());
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    // NOTE(review): valid region below is set via input_access, not output_access — confirm
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      ARM_COMPUTE_ERROR("Insufficient Padding!"); // raise it: a bare CREATE_ERROR is discarded
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+  // Debug-only: map the block/padding tensors and cross-check their values against the shapes.
+#if defined(DEBUG)
+  const_cast<ICLTensor *>(_block_size)->map(queue);
+  const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+  const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spacial_dimensions; ++i)
+  {
+    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+    const int32_t padding_size_pre =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+    const int32_t padding_size_post =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0, // either one negative
+                             "Padding size should be greater than or equal to 0");
+
+    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(i) !=
+              (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+    else
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) !=
+              (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) +
+               padding_size_pre + padding_size_post) /
+                  block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+
+    batch_size *= block_size; // expected output batch = input batch * product of block sizes
+  }
+  ARM_COMPUTE_ERROR_ON_MSG(
+      _output->info()->dimension(num_dimensions - 1) != batch_size,
+      "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+  const_cast<ICLTensor *>(_block_size)->unmap(queue);
+  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(DEBUG)
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: copy of the output slice with dimensions 0-3 reset to (0, 0, 0)
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Set block size window
+  Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+  // Set padding size window
+  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    add_1D_tensor_argument(idx, _block_size, win_block);
+    add_2D_tensor_argument(idx, _padding_size, win_padding);
+    enqueue(queue, *this, slice_out); // executed over the output slice
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 000000000..5d6329edc
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::S32, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+  // RETURN_ERROR_ON_MSG fires when its condition is true: each condition states the failure.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(2) * block_size * block_size != output->dimension(2),
+      "Output depth should be equal to (input depth * block size *block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) != 0 ||
+                                      (input->dimension(1) % block_size) != 0,
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
+                                      (output->dimension(1) != (input->dimension(1) / block_size)),
+                                  "Output height and width should be equal to "
+                                  "input_height/blocksize and input_width/blocksize respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} // tensors are bound in configure()
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  // Validates the arguments, builds the "space_to_depth" CL kernel and sets its window.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options: element type, block size and input depth (dimension 2)
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
+
+  // Configure kernel window over the input tensor, using default Steps
+  Window win = calculate_max_window(*input->info(), Steps());
+  // Output valid region: anchor `coord`, extent = the full output tensor shape
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice: copy of the input slice with dimensions 0-3 reset to (0, 0, 0)
+  Window slice_out(slice_in);
+  slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_out.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0; // kernel arguments are rebound for every slice
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in); // executed over the input slice
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
new file mode 100644
index 000000000..260bc39f1
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSquaredDifferenceKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+  const TensorShape &out_shape =
+      TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+  // Both inputs must be single-channel F16 or F32
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16, DataType::F32);
+  // An empty broadcast shape is reported as incompatible inputs
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+        "Wrong shape for output");
+  }
+  return Status{};
+}
+} // namespace
+
+CLSquaredDifferenceKernel::CLSquaredDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr) // tensors are bound in configure()
+{
+}
+
+void CLSquaredDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                          ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input1->info(), input2->info(), output->info()));
+  // NOTE(review): no nullptr check above; output type is compared before auto-initialization.
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  // Create kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())));
+  build_opts.emplace(
+      ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("squared_difference", build_opts));
+  // Broadcast the two input shapes to obtain the output shape and valid region
+  const std::pair<TensorShape, ValidRegion> broadcast_pair =
+      ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+
+  const TensorShape &out_shape = broadcast_pair.first;
+  const ValidRegion &valid_region = broadcast_pair.second;
+
+  // Auto initialize output if not initialized
+  {
+    set_shape_if_empty(*output->info(), out_shape);
+
+    if (input1->info()->data_type() == DataType::F16 &&
+        input2->info()->data_type() == DataType::F16)
+    {
+      set_format_if_unknown(*output->info(), Format::F16);
+    }
+    else if (input1->info()->data_type() == DataType::F32 ||
+             input2->info()->data_type() == DataType::F32)
+    {
+      set_format_if_unknown(*output->info(), Format::F32);
+    }
+  }
+  // One window for the output plus a broadcast-adjusted window per input
+  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+  Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info());
+  Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info());
+
+  AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  // NOTE(review): window_changed is computed but never checked — confirm intent
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLSquaredDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+  // Enqueues the kernel per 3D slice, collapsing dimensions from DimZ upwards when possible.
+  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+  const TensorShape &out_shape = _output->info()->tensor_shape();
+  // With >1 element in both inputs, collapsing needs rank > DimZ and equal extents from DimZ up
+  bool can_collapse = true;
+  if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+  {
+    can_collapse =
+        (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+    for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+    {
+      can_collapse = (in_shape1[d] == in_shape2[d]);
+    }
+  }
+
+  bool has_collapsed = false; // set by collapse_if_possible()
+  Window collapsed =
+      can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+                   : window;
+  // Use collapsed input shapes only if the window was actually collapsed
+  const TensorShape &in_shape1_collapsed =
+      has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+  const TensorShape &in_shape2_collapsed =
+      has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+  Window slice = collapsed.first_slice_window_3D();
+  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+  do
+  {
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input1, slice_input1);
+    add_3D_tensor_argument(idx, _input2, slice_input2);
+    add_3D_tensor_argument(idx, _output, slice);
+
+    enqueue(queue, *this, slice);
+    // Advance the input slices alongside the output slice
+    collapsed.slide_window_slice_3D(slice_input1);
+    collapsed.slide_window_slice_3D(slice_input2);
+  } while (collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLSquaredDifferenceKernel::border_size() const
+{
+  const unsigned int replicateSize = // output X extent minus the smaller input X extent
+      _output->info()->dimension(0) -
+      std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+  const unsigned int border = // capped at num_elems_processed_per_iteration - 1
+      std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+  return BorderSize(0, border, 0, 0); // presumably the right border — confirm argument order
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
index 80ffd423a..48146a43a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceExKernel.cpp
@@ -14,43 +14,30 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 
-#include <string>
-
-using namespace std;
 using namespace arm_compute;
 
-static const int32_t maxDim = 4;
-
-CLStridedSliceKernel::CLStridedSliceKernel()
+CLStridedSliceExKernel::CLStridedSliceExKernel()
     : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr),
       _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0)
 {
 }
 
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                      const ITensorInfo *begin, const ITensorInfo *end,
-                                      const ITensorInfo *strides, int32_t beginMask,
-                                      int32_t endMask, int32_t shrinkAxisMask)
+Status CLStridedSliceExKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                        const ITensorInfo *begin, const ITensorInfo *end,
+                                        const ITensorInfo *strides, int32_t beginMask,
+                                        int32_t endMask, int32_t shrinkAxisMask)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-      input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16,
-      DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32);
   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32);
@@ -153,15 +140,6 @@ inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride,
   return stop;
 }
 
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
-  int32_t offset = b * shape[2] * shape[1] * shape[0];
-  offset += d * shape[1] * shape[0];
-  offset += h * shape[0];
-  offset += w;
-  return offset;
-}
-
 inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
 {
   int32_t ret = 0;
@@ -177,10 +155,10 @@ inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
   return ret;
 }
 
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     ICLTensor *beginData, ICLTensor *endData,
-                                     ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
-                                     int32_t shrinkAxisMask)
+void CLStridedSliceExKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                       ICLTensor *beginData, ICLTensor *endData,
+                                       ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+                                       int32_t shrinkAxisMask)
 {
   ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(),
                                       endData->info(), stridesData->info(), beginMask, endMask,
@@ -195,48 +173,31 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
   _endMask = endMask;
   _shrinkAxisMask = shrinkAxisMask;
 
-  constexpr unsigned int num_elems_processed_per_iteration = 1;
-
   // Set kernel build options
   std::set<std::string> build_opts;
   build_opts.emplace("-DELEMENT_DATA_TYPE=" +
                      get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
 
   // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts));
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("strided_slice_ex", build_opts));
 
-  // Create output's window without padding
-  TensorShape collapsed = output->info()->tensor_shape();
-  collapsed.collapse(4);
-  TensorInfo info = *output->info();
-  info.set_tensor_shape(collapsed);
-  Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration));
-
-  ICLKernel::configure(win);
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+  ICLKernel::configure_internal(win);
 }
 
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceExKernel::run(const Window &window, cl::CommandQueue &queue)
 {
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-  // Create input window
-  TensorShape collapsed = _input->info()->tensor_shape();
-  collapsed.collapse(4);
-  TensorInfo info = *_input->info();
-  info.set_tensor_shape(collapsed);
-  Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size()));
-
   _beginData->map(queue);
   _endData->map(queue);
   _stridesData->map(queue);
 
-  std::vector<int32_t> dimsIn;
-  std::vector<int32_t> dimsOut;
   std::vector<int32_t> starts;
-  std::vector<int32_t> stops;
   std::vector<int32_t> strides;
 
   for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n)
@@ -246,22 +207,13 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
         StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n],
                      reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, n));
 
-    stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n],
-                                   reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape,
-                                   n));
-
     strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]);
-    dimsIn.emplace_back(shape[n]);
-    dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n]));
   }
 
   for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++)
   {
     starts.emplace_back(0);
-    stops.emplace_back(1);
     strides.emplace_back(1);
-    dimsIn.emplace_back(1);
-    dimsOut.emplace_back(1);
   }
   // TODO: Apply shrinkAxisMask
 
@@ -269,20 +221,7 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
   _stridesData->unmap(queue);
   _endData->unmap(queue);
 
-  // Set parameters
-  unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
-  const cl_int4 dimsInArg = {{
-      static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]),
-      static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, dimsInArg);
-
-  const cl_int4 dimsOutArg = {{
-      static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]),
-      static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]),
-  }};
-  _kernel.setArg<cl_int4>(idx++, dimsOutArg);
-
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
   const cl_int4 startsArg = {{
       static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]),
       static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]),
@@ -295,10 +234,20 @@ void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
   }};
   _kernel.setArg<cl_int4>(idx++, stridesArg);
 
-  // TODO: Apply slicing output's window
-  idx = 0;
-  add_1D_tensor_argument(idx, _input, win_in);
-  add_1D_tensor_argument(idx, _output, window);
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
 
-  enqueue(queue, *this, window);
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
 }
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
index d95b485b7..073c2f7bb 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -17,15 +17,8 @@
 #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <climits>
-#include <cassert>
 
 namespace arm_compute
 {
@@ -59,7 +52,7 @@ void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTens
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, 1, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue)
@@ -102,7 +95,7 @@ void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffe
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue)
@@ -147,7 +140,7 @@ void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n)
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -192,7 +185,7 @@ void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -236,7 +229,7 @@ void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buf
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -275,7 +268,7 @@ void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue)
@@ -322,7 +315,7 @@ void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n)
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue)
@@ -365,7 +358,7 @@ void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, in
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue)
@@ -404,7 +397,7 @@ void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, n, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue)
@@ -449,7 +442,7 @@ void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int
   // Configure kernel window
   Window win;
   win.set(0, Window::Dimension(0, k, 1));
-  ICLKernel::configure(win);
+  ICLKernel::configure_internal(win);
 }
 
 void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf)